diff --git a/.gitattributes b/.gitattributes index 11e312fd732405653cd8bac54ca5b3924daa3642..2c223cd604b1b594107f0dfb47468c2740e898b3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1444,3 +1444,12 @@ gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1 gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-375-sd-4/checkpoint-82/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-375-sd-4/checkpoint-99/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-375-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..25a6157dd64c1895e96eba28c31bfe1297c5fb71 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f81567cf32cadbaf0117b7ad6c654a6b5fedc1775d91f1bdb0c88027d5c8d77d +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ccdc7ae9cde2b8b66e78ffee1100907025132c5c --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6473a088390d144a71fb5d6262cfcb03166ec6325101c01439b478ba9bbc7cda +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7cef0c460bf693e72a2462da8a13edc08f577f4 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8be363955abd87a9b2cd189b2be95ffb18b56117f04a81c7d102a2a60fe5bf62 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ffd8abf20b055f04901d4204aae426e69a046e2 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6bd7fae2fa0cba607da0cb07a269842d7aefc88372e6504dbfbc5450fe150fd +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..94b80282fc90c2e41c89e7cbf693a65d100a3a26 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331d722674b800e8c81447e4ed79323e1d696462cdb1e33b3951468ac1e10380 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fc9f81339f9adfadbf56e0ec000c8a664d29d5f1 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/trainer_state.json @@ -0,0 +1,1071 @@ +{ + "best_metric": 1.8171186447143555, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1467, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013633265167007498, + "grad_norm": 0.7714291214942932, + "learning_rate": 0.0002, + "loss": 3.0982, + "step": 10 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.5473978519439697, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 20 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.5452795624732971, + "learning_rate": 0.0002, + "loss": 2.3079, + "step": 30 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.5098028779029846, + "learning_rate": 0.0002, + "loss": 2.0019, + "step": 40 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.48062971234321594, + "learning_rate": 0.0002, + "loss": 1.9333, + "step": 50 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.4505695104598999, + "learning_rate": 0.0002, + "loss": 1.9355, + "step": 60 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.41609591245651245, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 70 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.4323892593383789, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 80 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.4670293629169464, + "learning_rate": 0.0002, + "loss": 1.9294, + "step": 90 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.40623316168785095, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 100 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.3620383143424988, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 110 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.332218736410141, + "learning_rate": 0.0002, + "loss": 1.9238, + "step": 120 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.4004521667957306, + "learning_rate": 0.0002, + "loss": 1.93, + "step": 130 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 140 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.3847949504852295, + "learning_rate": 0.0002, + "loss": 1.8771, + "step": 150 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.36843451857566833, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 160 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.37301021814346313, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 170 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.3718886971473694, + "learning_rate": 0.0002, + "loss": 1.8909, + "step": 180 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.3088490962982178, + "learning_rate": 0.0002, + "loss": 1.8454, + "step": 190 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.3611852526664734, + "learning_rate": 0.0002, + "loss": 1.9254, + "step": 200 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.36093324422836304, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 210 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.3250400722026825, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 220 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.3566756248474121, + "learning_rate": 0.0002, + "loss": 1.8729, + "step": 230 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.32872408628463745, + "learning_rate": 0.0002, + "loss": 1.9259, + "step": 240 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.3983881175518036, + "learning_rate": 0.0002, + "loss": 1.9033, + "step": 250 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.3571510910987854, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 260 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.3036131262779236, + "learning_rate": 0.0002, + "loss": 1.8539, + "step": 270 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.36512863636016846, + "learning_rate": 0.0002, + "loss": 1.8572, + "step": 280 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.3429736793041229, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 290 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.3055964708328247, + "learning_rate": 0.0002, + "loss": 1.8754, + "step": 300 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.33801034092903137, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 310 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.348783016204834, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 320 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.3057514727115631, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 330 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.3849763572216034, + "learning_rate": 0.0002, + "loss": 1.8766, + "step": 340 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.30080053210258484, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 350 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.3595106303691864, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 360 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.31099820137023926, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 370 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.3157978355884552, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 380 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.27960965037345886, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 390 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.3102385103702545, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 400 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.32828861474990845, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 410 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.29560017585754395, + "learning_rate": 0.0002, + "loss": 1.8165, + "step": 420 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.33316895365715027, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 430 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.30420982837677, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.32619214057922363, + "learning_rate": 0.0002, + "loss": 1.7565, + "step": 450 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.3603750765323639, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 460 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.30834096670150757, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 470 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.28756365180015564, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 480 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.2878406345844269, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 490 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.31329697370529175, + "learning_rate": 0.0002, + "loss": 1.8581, + "step": 500 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.3405822515487671, + "learning_rate": 0.0002, + "loss": 1.7886, + "step": 510 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.305560827255249, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 520 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.2973416745662689, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 530 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.327303946018219, + "learning_rate": 0.0002, + "loss": 1.8223, + "step": 540 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.62595534324646, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 550 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.3129784166812897, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 560 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.32496583461761475, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 570 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.3098868131637573, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 580 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.30726853013038635, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 590 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.2964220643043518, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 600 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.32352274656295776, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 610 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.2938912510871887, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 620 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.295559823513031, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 630 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.34102028608322144, + "learning_rate": 0.0002, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.29676181077957153, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.3108902871608734, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 660 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.2690821588039398, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 670 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 680 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.8029476404190063, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 690 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.30534422397613525, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 700 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.2899954319000244, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 710 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.28814372420310974, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 720 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.3061596751213074, + "learning_rate": 0.0002, + "loss": 1.8865, + "step": 730 + }, + { + "epoch": 0.9993183367416496, + "eval_loss": 1.8171186447143555, + "eval_runtime": 53.6047, + "eval_samples_per_second": 9.458, + "eval_steps_per_second": 1.194, + "step": 733 + }, + { + "epoch": 1.008861622358555, + "grad_norm": 0.3140897750854492, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 740 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.3346109390258789, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 750 + }, + { + "epoch": 1.0361281526925699, + "grad_norm": 0.3582976758480072, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 760 + }, + { + "epoch": 1.0497614178595773, + "grad_norm": 0.30408260226249695, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 770 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 0.323585569858551, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 780 + }, + { + "epoch": 1.0770279481935923, + "grad_norm": 0.3474137783050537, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 790 + }, + { + "epoch": 1.0906612133606, + "grad_norm": 0.35721147060394287, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 800 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.35366931557655334, + "learning_rate": 0.0002, + "loss": 1.718, + "step": 810 + }, + { + "epoch": 1.117927743694615, + "grad_norm": 0.3250770568847656, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 820 + }, + { + "epoch": 1.1315610088616224, + "grad_norm": 0.3293766379356384, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 830 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 0.3380851745605469, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 840 + }, + { + "epoch": 1.1588275391956373, + "grad_norm": 0.32584455609321594, + "learning_rate": 0.0002, + "loss": 1.8236, + "step": 850 + }, + { + "epoch": 1.1724608043626448, + "grad_norm": 0.45700767636299133, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 860 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 0.30944544076919556, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 870 + }, + { + "epoch": 1.19972733469666, + "grad_norm": 0.3268151581287384, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 880 + }, + { + "epoch": 1.2133605998636674, + "grad_norm": 0.39972540736198425, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 890 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.7890929579734802, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 900 + }, + { + "epoch": 1.2406271301976823, + "grad_norm": 0.3439182639122009, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 910 + }, + { + "epoch": 1.2542603953646898, + "grad_norm": 0.3986225128173828, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 920 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 0.3514605164527893, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 930 + }, + { + "epoch": 1.2815269256987047, + "grad_norm": 0.3682589530944824, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 940 + }, + { + "epoch": 1.2951601908657122, + "grad_norm": 0.3618335723876953, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 950 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 0.345700740814209, + "learning_rate": 0.0002, + "loss": 1.7436, + "step": 960 + }, + { + "epoch": 1.3224267211997274, + "grad_norm": 0.3514927923679352, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 970 + }, + { + "epoch": 1.3360599863667348, + "grad_norm": 0.365647554397583, + "learning_rate": 0.0002, + "loss": 1.7704, + "step": 980 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.3407285809516907, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 990 + }, + { + "epoch": 1.3633265167007498, + "grad_norm": 0.3785437345504761, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1000 + }, + { + "epoch": 1.3769597818677572, + "grad_norm": 0.34746724367141724, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1010 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 0.362444132566452, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1020 + }, + { + "epoch": 1.4042263122017724, + "grad_norm": 0.4424704611301422, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1030 + }, + { + "epoch": 1.4178595773687799, + "grad_norm": 0.38722458481788635, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 1040 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.36089080572128296, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 1050 + }, + { + "epoch": 1.4451261077027948, + "grad_norm": 0.33817124366760254, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 1060 + }, + { + "epoch": 1.4587593728698023, + "grad_norm": 0.34334081411361694, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 1070 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.3776826858520508, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1080 + }, + { + "epoch": 1.4860259032038172, + "grad_norm": 0.4169026017189026, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 1090 + }, + { + "epoch": 1.4996591683708247, + "grad_norm": 0.34898945689201355, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 1100 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 0.34223780035972595, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1110 + }, + { + "epoch": 1.5269256987048399, + "grad_norm": 0.3686901032924652, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1120 + }, + { + "epoch": 1.5405589638718473, + "grad_norm": 0.35054415464401245, + "learning_rate": 0.0002, + "loss": 1.7525, + "step": 1130 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 0.39496365189552307, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 1140 + }, + { + "epoch": 1.5678254942058623, + "grad_norm": 0.35451626777648926, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 1150 + }, + { + "epoch": 1.58145875937287, + "grad_norm": 0.3848083019256592, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1160 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.3760537803173065, + "learning_rate": 0.0002, + "loss": 1.7272, + "step": 1170 + }, + { + "epoch": 1.6087252897068849, + "grad_norm": 0.38981738686561584, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1180 + }, + { + "epoch": 1.6223585548738924, + "grad_norm": 0.36830949783325195, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 1190 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.3405892848968506, + "learning_rate": 0.0002, + "loss": 1.6925, + "step": 1200 + }, + { + "epoch": 1.6496250852079073, + "grad_norm": 0.39027872681617737, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 1210 + }, + { + "epoch": 1.6632583503749148, + "grad_norm": 0.3342694044113159, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 1220 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 0.3600076735019684, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1230 + }, + { + "epoch": 1.6905248807089297, + "grad_norm": 0.3625542223453522, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1240 + }, + { + "epoch": 1.7041581458759372, + "grad_norm": 0.32170894742012024, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1250 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.3544139862060547, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1260 + }, + { + "epoch": 1.7314246762099523, + "grad_norm": 0.35113027691841125, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1270 + }, + { + "epoch": 1.7450579413769598, + "grad_norm": 0.3499974310398102, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1280 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 0.3285157382488251, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1290 + }, + { + "epoch": 1.7723244717109747, + "grad_norm": 0.3701961636543274, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1300 + }, + { + "epoch": 1.7859577368779824, + "grad_norm": 0.3301318287849426, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 1310 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 0.37801554799079895, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 1320 + }, + { + "epoch": 1.8132242672119974, + "grad_norm": 0.3726748526096344, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1330 + }, + { + "epoch": 1.8268575323790048, + "grad_norm": 0.4059790074825287, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1340 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.35712096095085144, + "learning_rate": 0.0002, + "loss": 1.7739, + "step": 1350 + }, + { + "epoch": 1.8541240627130198, + "grad_norm": 0.35995328426361084, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1360 + }, + { + "epoch": 1.8677573278800272, + "grad_norm": 0.3679947257041931, + "learning_rate": 0.0002, + "loss": 1.7332, + "step": 1370 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 0.39645957946777344, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 1380 + }, + { + "epoch": 1.8950238582140422, + "grad_norm": 0.35288700461387634, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1390 + }, + { + "epoch": 1.9086571233810496, + "grad_norm": 0.32579198479652405, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 1400 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 0.3856561779975891, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1410 + }, + { + "epoch": 1.9359236537150648, + "grad_norm": 0.39019331336021423, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 1420 + }, + { + "epoch": 1.9495569188820723, + "grad_norm": 0.38006502389907837, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 1430 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.38100454211235046, + "learning_rate": 0.0002, + "loss": 1.8323, + "step": 1440 + }, + { + "epoch": 1.9768234492160872, + "grad_norm": 0.3405798673629761, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 1450 + }, + { + "epoch": 1.990456714383095, + "grad_norm": 0.36582913994789124, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1460 + }, + { + "epoch": 2.0, + "eval_loss": 1.8178424835205078, + "eval_runtime": 53.6524, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 1.193, + "step": 1467 + } + ], + "logging_steps": 10, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.535061483651072e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1467/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..469ee8f5580b54dc41ec605ed1f30225f2fd4009 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5255b8371af121ffd938ad3b296c11bf1b161f300e8b145036ba0aaa36da236e +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..849848428ede06e4d606f182ef19ca8b9ad0b7fc --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d06b6771d2ca9a49988d3e599bb788b097cb6f214f737a432c934551a4d6b6 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c0c1ca4878f51ffb00e95998fd416c72a5508d0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed8108cffd99043e9b0dd148146f3f148b2da7a2ba7574bb45a7036fcb69d26b +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b82681360d9d3e2a4611f6fac274e2484a59eecd --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:140316dec3419a0b494998a177826eeb12bf333a3620cbbec09475ff9aad2c93 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..471ad5d16cd7ad12b9adad3c6b8839104bd6ea8d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/trainer_state.json @@ -0,0 +1,1597 @@ +{ + "best_metric": 1.8171186447143555, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", + "epoch": 2.9993183367416494, + "eval_steps": 10, + "global_step": 2200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013633265167007498, + "grad_norm": 0.7714291214942932, + "learning_rate": 0.0002, + "loss": 3.0982, + "step": 10 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.5473978519439697, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 20 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.5452795624732971, + "learning_rate": 0.0002, + "loss": 2.3079, + "step": 30 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.5098028779029846, + "learning_rate": 0.0002, + "loss": 2.0019, + "step": 40 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.48062971234321594, + "learning_rate": 0.0002, + "loss": 1.9333, + "step": 50 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.4505695104598999, + "learning_rate": 0.0002, + "loss": 1.9355, + "step": 60 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.41609591245651245, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 70 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.4323892593383789, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 80 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.4670293629169464, + "learning_rate": 0.0002, + "loss": 1.9294, + "step": 90 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.40623316168785095, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 100 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.3620383143424988, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 110 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.332218736410141, + "learning_rate": 0.0002, + "loss": 1.9238, + "step": 120 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.4004521667957306, + "learning_rate": 0.0002, + "loss": 1.93, + "step": 130 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 140 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.3847949504852295, + "learning_rate": 0.0002, + "loss": 1.8771, + "step": 150 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.36843451857566833, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 160 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.37301021814346313, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 170 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.3718886971473694, + "learning_rate": 0.0002, + "loss": 1.8909, + "step": 180 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.3088490962982178, + "learning_rate": 0.0002, + "loss": 1.8454, + "step": 190 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.3611852526664734, + "learning_rate": 0.0002, + "loss": 1.9254, + "step": 200 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.36093324422836304, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 210 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.3250400722026825, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 220 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.3566756248474121, + "learning_rate": 0.0002, + "loss": 1.8729, + "step": 230 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.32872408628463745, + "learning_rate": 0.0002, + "loss": 1.9259, + "step": 240 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.3983881175518036, + "learning_rate": 0.0002, + "loss": 1.9033, + "step": 250 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.3571510910987854, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 260 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.3036131262779236, + "learning_rate": 0.0002, + "loss": 1.8539, + "step": 270 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.36512863636016846, + "learning_rate": 0.0002, + "loss": 1.8572, + "step": 280 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.3429736793041229, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 290 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.3055964708328247, + "learning_rate": 0.0002, + "loss": 1.8754, + "step": 300 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.33801034092903137, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 310 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.348783016204834, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 320 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.3057514727115631, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 330 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.3849763572216034, + "learning_rate": 0.0002, + "loss": 1.8766, + "step": 340 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.30080053210258484, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 350 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.3595106303691864, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 360 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.31099820137023926, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 370 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.3157978355884552, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 380 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.27960965037345886, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 390 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.3102385103702545, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 400 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.32828861474990845, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 410 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.29560017585754395, + "learning_rate": 0.0002, + "loss": 1.8165, + "step": 420 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.33316895365715027, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 430 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.30420982837677, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.32619214057922363, + "learning_rate": 0.0002, + "loss": 1.7565, + "step": 450 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.3603750765323639, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 460 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.30834096670150757, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 470 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.28756365180015564, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 480 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.2878406345844269, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 490 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.31329697370529175, + "learning_rate": 0.0002, + "loss": 1.8581, + "step": 500 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.3405822515487671, + "learning_rate": 0.0002, + "loss": 1.7886, + "step": 510 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.305560827255249, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 520 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.2973416745662689, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 530 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.327303946018219, + "learning_rate": 0.0002, + "loss": 1.8223, + "step": 540 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.62595534324646, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 550 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.3129784166812897, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 560 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.32496583461761475, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 570 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.3098868131637573, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 580 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.30726853013038635, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 590 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.2964220643043518, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 600 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.32352274656295776, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 610 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.2938912510871887, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 620 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.295559823513031, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 630 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.34102028608322144, + "learning_rate": 0.0002, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.29676181077957153, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.3108902871608734, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 660 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.2690821588039398, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 670 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 680 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.8029476404190063, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 690 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.30534422397613525, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 700 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.2899954319000244, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 710 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.28814372420310974, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 720 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.3061596751213074, + "learning_rate": 0.0002, + "loss": 1.8865, + "step": 730 + }, + { + "epoch": 0.9993183367416496, + "eval_loss": 1.8171186447143555, + "eval_runtime": 53.6047, + "eval_samples_per_second": 9.458, + "eval_steps_per_second": 1.194, + "step": 733 + }, + { + "epoch": 1.008861622358555, + "grad_norm": 0.3140897750854492, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 740 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.3346109390258789, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 750 + }, + { + "epoch": 1.0361281526925699, + "grad_norm": 0.3582976758480072, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 760 + }, + { + "epoch": 1.0497614178595773, + "grad_norm": 0.30408260226249695, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 770 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 0.323585569858551, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 780 + }, + { + "epoch": 1.0770279481935923, + "grad_norm": 0.3474137783050537, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 790 + }, + { + "epoch": 1.0906612133606, + "grad_norm": 0.35721147060394287, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 800 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.35366931557655334, + "learning_rate": 0.0002, + "loss": 1.718, + "step": 810 + }, + { + "epoch": 1.117927743694615, + "grad_norm": 0.3250770568847656, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 820 + }, + { + "epoch": 1.1315610088616224, + "grad_norm": 0.3293766379356384, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 830 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 0.3380851745605469, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 840 + }, + { + "epoch": 1.1588275391956373, + "grad_norm": 0.32584455609321594, + "learning_rate": 0.0002, + "loss": 1.8236, + "step": 850 + }, + { + "epoch": 1.1724608043626448, + "grad_norm": 0.45700767636299133, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 860 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 0.30944544076919556, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 870 + }, + { + "epoch": 1.19972733469666, + "grad_norm": 0.3268151581287384, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 880 + }, + { + "epoch": 1.2133605998636674, + "grad_norm": 0.39972540736198425, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 890 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.7890929579734802, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 900 + }, + { + "epoch": 1.2406271301976823, + "grad_norm": 0.3439182639122009, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 910 + }, + { + "epoch": 1.2542603953646898, + "grad_norm": 0.3986225128173828, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 920 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 0.3514605164527893, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 930 + }, + { + "epoch": 1.2815269256987047, + "grad_norm": 0.3682589530944824, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 940 + }, + { + "epoch": 1.2951601908657122, + "grad_norm": 0.3618335723876953, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 950 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 0.345700740814209, + "learning_rate": 0.0002, + "loss": 1.7436, + "step": 960 + }, + { + "epoch": 1.3224267211997274, + "grad_norm": 0.3514927923679352, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 970 + }, + { + "epoch": 1.3360599863667348, + "grad_norm": 0.365647554397583, + "learning_rate": 0.0002, + "loss": 1.7704, + "step": 980 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.3407285809516907, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 990 + }, + { + "epoch": 1.3633265167007498, + "grad_norm": 0.3785437345504761, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1000 + }, + { + "epoch": 1.3769597818677572, + "grad_norm": 0.34746724367141724, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1010 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 0.362444132566452, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1020 + }, + { + "epoch": 1.4042263122017724, + "grad_norm": 0.4424704611301422, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1030 + }, + { + "epoch": 1.4178595773687799, + "grad_norm": 0.38722458481788635, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 1040 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.36089080572128296, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 1050 + }, + { + "epoch": 1.4451261077027948, + "grad_norm": 0.33817124366760254, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 1060 + }, + { + "epoch": 1.4587593728698023, + "grad_norm": 0.34334081411361694, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 1070 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.3776826858520508, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1080 + }, + { + "epoch": 1.4860259032038172, + "grad_norm": 0.4169026017189026, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 1090 + }, + { + "epoch": 1.4996591683708247, + "grad_norm": 0.34898945689201355, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 1100 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 0.34223780035972595, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1110 + }, + { + "epoch": 1.5269256987048399, + "grad_norm": 0.3686901032924652, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1120 + }, + { + "epoch": 1.5405589638718473, + "grad_norm": 0.35054415464401245, + "learning_rate": 0.0002, + "loss": 1.7525, + "step": 1130 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 0.39496365189552307, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 1140 + }, + { + "epoch": 1.5678254942058623, + "grad_norm": 0.35451626777648926, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 1150 + }, + { + "epoch": 1.58145875937287, + "grad_norm": 0.3848083019256592, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1160 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.3760537803173065, + "learning_rate": 0.0002, + "loss": 1.7272, + "step": 1170 + }, + { + "epoch": 1.6087252897068849, + "grad_norm": 0.38981738686561584, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1180 + }, + { + "epoch": 1.6223585548738924, + "grad_norm": 0.36830949783325195, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 1190 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.3405892848968506, + "learning_rate": 0.0002, + "loss": 1.6925, + "step": 1200 + }, + { + "epoch": 1.6496250852079073, + "grad_norm": 0.39027872681617737, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 1210 + }, + { + "epoch": 1.6632583503749148, + "grad_norm": 0.3342694044113159, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 1220 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 0.3600076735019684, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1230 + }, + { + "epoch": 1.6905248807089297, + "grad_norm": 0.3625542223453522, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1240 + }, + { + "epoch": 1.7041581458759372, + "grad_norm": 0.32170894742012024, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1250 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.3544139862060547, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1260 + }, + { + "epoch": 1.7314246762099523, + "grad_norm": 0.35113027691841125, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1270 + }, + { + "epoch": 1.7450579413769598, + "grad_norm": 0.3499974310398102, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1280 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 0.3285157382488251, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1290 + }, + { + "epoch": 1.7723244717109747, + "grad_norm": 0.3701961636543274, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1300 + }, + { + "epoch": 1.7859577368779824, + "grad_norm": 0.3301318287849426, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 1310 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 0.37801554799079895, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 1320 + }, + { + "epoch": 1.8132242672119974, + "grad_norm": 0.3726748526096344, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1330 + }, + { + "epoch": 1.8268575323790048, + "grad_norm": 0.4059790074825287, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1340 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.35712096095085144, + "learning_rate": 0.0002, + "loss": 1.7739, + "step": 1350 + }, + { + "epoch": 1.8541240627130198, + "grad_norm": 0.35995328426361084, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1360 + }, + { + "epoch": 1.8677573278800272, + "grad_norm": 0.3679947257041931, + "learning_rate": 0.0002, + "loss": 1.7332, + "step": 1370 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 0.39645957946777344, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 1380 + }, + { + "epoch": 1.8950238582140422, + "grad_norm": 0.35288700461387634, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1390 + }, + { + "epoch": 1.9086571233810496, + "grad_norm": 0.32579198479652405, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 1400 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 0.3856561779975891, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1410 + }, + { + "epoch": 1.9359236537150648, + "grad_norm": 0.39019331336021423, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 1420 + }, + { + "epoch": 1.9495569188820723, + "grad_norm": 0.38006502389907837, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 1430 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.38100454211235046, + "learning_rate": 0.0002, + "loss": 1.8323, + "step": 1440 + }, + { + "epoch": 1.9768234492160872, + "grad_norm": 0.3405798673629761, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 1450 + }, + { + "epoch": 1.990456714383095, + "grad_norm": 0.36582913994789124, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1460 + }, + { + "epoch": 2.0, + "eval_loss": 1.8178424835205078, + "eval_runtime": 53.6524, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 1.193, + "step": 1467 + }, + { + "epoch": 2.0040899795501024, + "grad_norm": 0.3626647889614105, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 1470 + }, + { + "epoch": 2.01772324471711, + "grad_norm": 0.40171775221824646, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 1480 + }, + { + "epoch": 2.0313565098841173, + "grad_norm": 0.5805319547653198, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 1490 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 0.41954153776168823, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 1500 + }, + { + "epoch": 2.0586230402181322, + "grad_norm": 0.47190725803375244, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1510 + }, + { + "epoch": 2.0722563053851397, + "grad_norm": 0.4388456344604492, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1520 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 2.2171926498413086, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 1530 + }, + { + "epoch": 2.0995228357191547, + "grad_norm": 0.4314221143722534, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 1540 + }, + { + "epoch": 2.113156100886162, + "grad_norm": 0.4154265522956848, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 1550 + }, + { + "epoch": 2.1267893660531696, + "grad_norm": 0.5025539994239807, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1560 + }, + { + "epoch": 2.140422631220177, + "grad_norm": 0.5410493016242981, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1570 + }, + { + "epoch": 2.1540558963871845, + "grad_norm": 0.4478487968444824, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 1580 + }, + { + "epoch": 2.1676891615541924, + "grad_norm": 0.4703652560710907, + "learning_rate": 0.0002, + "loss": 1.5536, + "step": 1590 + }, + { + "epoch": 2.1813224267212, + "grad_norm": 0.4555390179157257, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1600 + }, + { + "epoch": 2.1949556918882074, + "grad_norm": 0.4877263903617859, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1610 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.48708245158195496, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1620 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.47523951530456543, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 1630 + }, + { + "epoch": 2.23585548738923, + "grad_norm": 0.4889199733734131, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 1640 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 0.4585252106189728, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 1650 + }, + { + "epoch": 2.2631220177232447, + "grad_norm": 0.4764868915081024, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1660 + }, + { + "epoch": 2.276755282890252, + "grad_norm": 0.5028976202011108, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 1670 + }, + { + "epoch": 2.2903885480572597, + "grad_norm": 0.46131211519241333, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 1680 + }, + { + "epoch": 2.304021813224267, + "grad_norm": 0.5422874689102173, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 1690 + }, + { + "epoch": 2.3176550783912746, + "grad_norm": 0.47615355253219604, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1700 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 0.48005548119544983, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1710 + }, + { + "epoch": 2.3449216087252895, + "grad_norm": 0.4387182295322418, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1720 + }, + { + "epoch": 2.358554873892297, + "grad_norm": 0.4487272799015045, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 1730 + }, + { + "epoch": 2.372188139059305, + "grad_norm": 0.5046455264091492, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 1740 + }, + { + "epoch": 2.3858214042263124, + "grad_norm": 0.4653521180152893, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 1750 + }, + { + "epoch": 2.39945466939332, + "grad_norm": 0.4737723469734192, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 1760 + }, + { + "epoch": 2.4130879345603273, + "grad_norm": 0.4501931071281433, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 1770 + }, + { + "epoch": 2.426721199727335, + "grad_norm": 0.4772880971431732, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 1780 + }, + { + "epoch": 2.4403544648943423, + "grad_norm": 0.4544616937637329, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 1790 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.488313227891922, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 1800 + }, + { + "epoch": 2.467620995228357, + "grad_norm": 0.5057830214500427, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 1810 + }, + { + "epoch": 2.4812542603953647, + "grad_norm": 0.5049484968185425, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 1820 + }, + { + "epoch": 2.494887525562372, + "grad_norm": 0.44966644048690796, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1830 + }, + { + "epoch": 2.5085207907293796, + "grad_norm": 0.5072630643844604, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1840 + }, + { + "epoch": 2.522154055896387, + "grad_norm": 0.43989792466163635, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1850 + }, + { + "epoch": 2.5357873210633946, + "grad_norm": 1.3504403829574585, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1860 + }, + { + "epoch": 2.549420586230402, + "grad_norm": 0.46545976400375366, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 1870 + }, + { + "epoch": 2.5630538513974095, + "grad_norm": 0.4678342044353485, + "learning_rate": 0.0002, + "loss": 1.6368, + "step": 1880 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.529755711555481, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 1890 + }, + { + "epoch": 2.5903203817314244, + "grad_norm": 0.5000199675559998, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 1900 + }, + { + "epoch": 2.6039536468984323, + "grad_norm": 0.5649300217628479, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 1910 + }, + { + "epoch": 2.61758691206544, + "grad_norm": 0.7920585870742798, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 1920 + }, + { + "epoch": 2.6312201772324473, + "grad_norm": 0.4960342049598694, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 1930 + }, + { + "epoch": 2.6448534423994547, + "grad_norm": 0.5324710011482239, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1940 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 0.606343150138855, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 1950 + }, + { + "epoch": 2.6721199727334697, + "grad_norm": 0.53038489818573, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1960 + }, + { + "epoch": 2.685753237900477, + "grad_norm": 0.4579465091228485, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 1970 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.4541707932949066, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 1980 + }, + { + "epoch": 2.713019768234492, + "grad_norm": 0.5009395480155945, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 1990 + }, + { + "epoch": 2.7266530334014996, + "grad_norm": 0.4723006784915924, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 2000 + }, + { + "epoch": 2.740286298568507, + "grad_norm": 0.5086126923561096, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2010 + }, + { + "epoch": 2.7539195637355145, + "grad_norm": 0.47242608666419983, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2020 + }, + { + "epoch": 2.767552828902522, + "grad_norm": 0.44922566413879395, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2030 + }, + { + "epoch": 2.78118609406953, + "grad_norm": 0.420259565114975, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 2040 + }, + { + "epoch": 2.794819359236537, + "grad_norm": 0.4762881100177765, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 2050 + }, + { + "epoch": 2.808452624403545, + "grad_norm": 0.5228786468505859, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 2060 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.4796035587787628, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 2070 + }, + { + "epoch": 2.8357191547375598, + "grad_norm": 0.5034735202789307, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 2080 + }, + { + "epoch": 2.8493524199045672, + "grad_norm": 0.48005399107933044, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 2090 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 0.578820526599884, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2100 + }, + { + "epoch": 2.876618950238582, + "grad_norm": 0.48982638120651245, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 2110 + }, + { + "epoch": 2.8902522154055896, + "grad_norm": 0.5157325863838196, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2120 + }, + { + "epoch": 2.903885480572597, + "grad_norm": 0.49149683117866516, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 2130 + }, + { + "epoch": 2.9175187457396046, + "grad_norm": 0.48584499955177307, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 2140 + }, + { + "epoch": 2.931152010906612, + "grad_norm": 0.5199017524719238, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 2150 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.5788236856460571, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 2160 + }, + { + "epoch": 2.958418541240627, + "grad_norm": 0.48664185404777527, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2170 + }, + { + "epoch": 2.9720518064076344, + "grad_norm": 0.5026682615280151, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2180 + }, + { + "epoch": 2.9856850715746424, + "grad_norm": 0.49317044019699097, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 2190 + }, + { + "epoch": 2.9993183367416494, + "grad_norm": 0.5729128122329712, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 2200 + }, + { + "epoch": 2.9993183367416494, + "eval_loss": 1.8527295589447021, + "eval_runtime": 53.6403, + "eval_samples_per_second": 9.452, + "eval_steps_per_second": 1.193, + "step": 2200 + } + ], + "logging_steps": 10, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1302592225476608e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a053039fd404a8af1cca2911cc0aaeabe3d34ea9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15d322a6c4b731d65414a02fb7b29ee75b00db72c16e6c8653c71479e00fa0e1 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7738dc60aa62c1609414c0f2b55306f9b1e09171 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09e218b33248d7341dd93648fcfd8314302bc10aba9cad499e60cdb6caf4e77 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..11d2f37303419521e74c16b6b0a3234dab88fd87 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908b095cc2af9ec1c6d23f35a53f10a3767da3c4e2463cb3255296c45b67b341 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdc465ca7f4d94e4cd869785f8e501d0e0595c78 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa0d97a46a5f131b267c442bd1fc27c085f03878ba83833c5fb8f7c8691b001 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91db24949569787a6742f1ab1c72e7155c190e0d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/trainer_state.json @@ -0,0 +1,2116 @@ +{ + "best_metric": 1.8171186447143555, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 2934, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013633265167007498, + "grad_norm": 0.7714291214942932, + "learning_rate": 0.0002, + "loss": 3.0982, + "step": 10 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.5473978519439697, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 20 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.5452795624732971, + "learning_rate": 0.0002, + "loss": 2.3079, + "step": 30 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.5098028779029846, + "learning_rate": 0.0002, + "loss": 2.0019, + "step": 40 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.48062971234321594, + "learning_rate": 0.0002, + "loss": 1.9333, + "step": 50 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.4505695104598999, + "learning_rate": 0.0002, + "loss": 1.9355, + "step": 60 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.41609591245651245, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 70 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.4323892593383789, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 80 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.4670293629169464, + "learning_rate": 0.0002, + "loss": 1.9294, + "step": 90 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.40623316168785095, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 100 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.3620383143424988, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 110 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.332218736410141, + "learning_rate": 0.0002, + "loss": 1.9238, + "step": 120 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.4004521667957306, + "learning_rate": 0.0002, + "loss": 1.93, + "step": 130 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 140 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.3847949504852295, + "learning_rate": 0.0002, + "loss": 1.8771, + "step": 150 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.36843451857566833, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 160 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.37301021814346313, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 170 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.3718886971473694, + "learning_rate": 0.0002, + "loss": 1.8909, + "step": 180 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.3088490962982178, + "learning_rate": 0.0002, + "loss": 1.8454, + "step": 190 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.3611852526664734, + "learning_rate": 0.0002, + "loss": 1.9254, + "step": 200 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.36093324422836304, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 210 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.3250400722026825, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 220 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.3566756248474121, + "learning_rate": 0.0002, + "loss": 1.8729, + "step": 230 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.32872408628463745, + "learning_rate": 0.0002, + "loss": 1.9259, + "step": 240 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.3983881175518036, + "learning_rate": 0.0002, + "loss": 1.9033, + "step": 250 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.3571510910987854, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 260 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.3036131262779236, + "learning_rate": 0.0002, + "loss": 1.8539, + "step": 270 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.36512863636016846, + "learning_rate": 0.0002, + "loss": 1.8572, + "step": 280 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.3429736793041229, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 290 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.3055964708328247, + "learning_rate": 0.0002, + "loss": 1.8754, + "step": 300 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.33801034092903137, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 310 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.348783016204834, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 320 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.3057514727115631, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 330 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.3849763572216034, + "learning_rate": 0.0002, + "loss": 1.8766, + "step": 340 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.30080053210258484, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 350 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.3595106303691864, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 360 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.31099820137023926, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 370 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.3157978355884552, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 380 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.27960965037345886, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 390 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.3102385103702545, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 400 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.32828861474990845, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 410 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.29560017585754395, + "learning_rate": 0.0002, + "loss": 1.8165, + "step": 420 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.33316895365715027, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 430 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.30420982837677, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.32619214057922363, + "learning_rate": 0.0002, + "loss": 1.7565, + "step": 450 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.3603750765323639, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 460 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.30834096670150757, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 470 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.28756365180015564, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 480 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.2878406345844269, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 490 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.31329697370529175, + "learning_rate": 0.0002, + "loss": 1.8581, + "step": 500 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.3405822515487671, + "learning_rate": 0.0002, + "loss": 1.7886, + "step": 510 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.305560827255249, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 520 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.2973416745662689, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 530 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.327303946018219, + "learning_rate": 0.0002, + "loss": 1.8223, + "step": 540 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.62595534324646, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 550 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.3129784166812897, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 560 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.32496583461761475, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 570 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.3098868131637573, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 580 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.30726853013038635, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 590 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.2964220643043518, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 600 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.32352274656295776, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 610 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.2938912510871887, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 620 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.295559823513031, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 630 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.34102028608322144, + "learning_rate": 0.0002, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.29676181077957153, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.3108902871608734, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 660 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.2690821588039398, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 670 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 680 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.8029476404190063, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 690 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.30534422397613525, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 700 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.2899954319000244, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 710 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.28814372420310974, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 720 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.3061596751213074, + "learning_rate": 0.0002, + "loss": 1.8865, + "step": 730 + }, + { + "epoch": 0.9993183367416496, + "eval_loss": 1.8171186447143555, + "eval_runtime": 53.6047, + "eval_samples_per_second": 9.458, + "eval_steps_per_second": 1.194, + "step": 733 + }, + { + "epoch": 1.008861622358555, + "grad_norm": 0.3140897750854492, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 740 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.3346109390258789, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 750 + }, + { + "epoch": 1.0361281526925699, + "grad_norm": 0.3582976758480072, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 760 + }, + { + "epoch": 1.0497614178595773, + "grad_norm": 0.30408260226249695, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 770 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 0.323585569858551, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 780 + }, + { + "epoch": 1.0770279481935923, + "grad_norm": 0.3474137783050537, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 790 + }, + { + "epoch": 1.0906612133606, + "grad_norm": 0.35721147060394287, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 800 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.35366931557655334, + "learning_rate": 0.0002, + "loss": 1.718, + "step": 810 + }, + { + "epoch": 1.117927743694615, + "grad_norm": 0.3250770568847656, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 820 + }, + { + "epoch": 1.1315610088616224, + "grad_norm": 0.3293766379356384, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 830 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 0.3380851745605469, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 840 + }, + { + "epoch": 1.1588275391956373, + "grad_norm": 0.32584455609321594, + "learning_rate": 0.0002, + "loss": 1.8236, + "step": 850 + }, + { + "epoch": 1.1724608043626448, + "grad_norm": 0.45700767636299133, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 860 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 0.30944544076919556, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 870 + }, + { + "epoch": 1.19972733469666, + "grad_norm": 0.3268151581287384, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 880 + }, + { + "epoch": 1.2133605998636674, + "grad_norm": 0.39972540736198425, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 890 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.7890929579734802, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 900 + }, + { + "epoch": 1.2406271301976823, + "grad_norm": 0.3439182639122009, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 910 + }, + { + "epoch": 1.2542603953646898, + "grad_norm": 0.3986225128173828, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 920 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 0.3514605164527893, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 930 + }, + { + "epoch": 1.2815269256987047, + "grad_norm": 0.3682589530944824, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 940 + }, + { + "epoch": 1.2951601908657122, + "grad_norm": 0.3618335723876953, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 950 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 0.345700740814209, + "learning_rate": 0.0002, + "loss": 1.7436, + "step": 960 + }, + { + "epoch": 1.3224267211997274, + "grad_norm": 0.3514927923679352, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 970 + }, + { + "epoch": 1.3360599863667348, + "grad_norm": 0.365647554397583, + "learning_rate": 0.0002, + "loss": 1.7704, + "step": 980 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.3407285809516907, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 990 + }, + { + "epoch": 1.3633265167007498, + "grad_norm": 0.3785437345504761, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1000 + }, + { + "epoch": 1.3769597818677572, + "grad_norm": 0.34746724367141724, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1010 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 0.362444132566452, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1020 + }, + { + "epoch": 1.4042263122017724, + "grad_norm": 0.4424704611301422, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1030 + }, + { + "epoch": 1.4178595773687799, + "grad_norm": 0.38722458481788635, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 1040 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.36089080572128296, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 1050 + }, + { + "epoch": 1.4451261077027948, + "grad_norm": 0.33817124366760254, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 1060 + }, + { + "epoch": 1.4587593728698023, + "grad_norm": 0.34334081411361694, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 1070 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.3776826858520508, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1080 + }, + { + "epoch": 1.4860259032038172, + "grad_norm": 0.4169026017189026, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 1090 + }, + { + "epoch": 1.4996591683708247, + "grad_norm": 0.34898945689201355, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 1100 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 0.34223780035972595, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1110 + }, + { + "epoch": 1.5269256987048399, + "grad_norm": 0.3686901032924652, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1120 + }, + { + "epoch": 1.5405589638718473, + "grad_norm": 0.35054415464401245, + "learning_rate": 0.0002, + "loss": 1.7525, + "step": 1130 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 0.39496365189552307, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 1140 + }, + { + "epoch": 1.5678254942058623, + "grad_norm": 0.35451626777648926, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 1150 + }, + { + "epoch": 1.58145875937287, + "grad_norm": 0.3848083019256592, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1160 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.3760537803173065, + "learning_rate": 0.0002, + "loss": 1.7272, + "step": 1170 + }, + { + "epoch": 1.6087252897068849, + "grad_norm": 0.38981738686561584, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1180 + }, + { + "epoch": 1.6223585548738924, + "grad_norm": 0.36830949783325195, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 1190 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.3405892848968506, + "learning_rate": 0.0002, + "loss": 1.6925, + "step": 1200 + }, + { + "epoch": 1.6496250852079073, + "grad_norm": 0.39027872681617737, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 1210 + }, + { + "epoch": 1.6632583503749148, + "grad_norm": 0.3342694044113159, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 1220 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 0.3600076735019684, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1230 + }, + { + "epoch": 1.6905248807089297, + "grad_norm": 0.3625542223453522, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1240 + }, + { + "epoch": 1.7041581458759372, + "grad_norm": 0.32170894742012024, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1250 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.3544139862060547, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1260 + }, + { + "epoch": 1.7314246762099523, + "grad_norm": 0.35113027691841125, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1270 + }, + { + "epoch": 1.7450579413769598, + "grad_norm": 0.3499974310398102, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1280 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 0.3285157382488251, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1290 + }, + { + "epoch": 1.7723244717109747, + "grad_norm": 0.3701961636543274, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1300 + }, + { + "epoch": 1.7859577368779824, + "grad_norm": 0.3301318287849426, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 1310 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 0.37801554799079895, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 1320 + }, + { + "epoch": 1.8132242672119974, + "grad_norm": 0.3726748526096344, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1330 + }, + { + "epoch": 1.8268575323790048, + "grad_norm": 0.4059790074825287, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1340 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.35712096095085144, + "learning_rate": 0.0002, + "loss": 1.7739, + "step": 1350 + }, + { + "epoch": 1.8541240627130198, + "grad_norm": 0.35995328426361084, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1360 + }, + { + "epoch": 1.8677573278800272, + "grad_norm": 0.3679947257041931, + "learning_rate": 0.0002, + "loss": 1.7332, + "step": 1370 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 0.39645957946777344, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 1380 + }, + { + "epoch": 1.8950238582140422, + "grad_norm": 0.35288700461387634, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1390 + }, + { + "epoch": 1.9086571233810496, + "grad_norm": 0.32579198479652405, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 1400 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 0.3856561779975891, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1410 + }, + { + "epoch": 1.9359236537150648, + "grad_norm": 0.39019331336021423, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 1420 + }, + { + "epoch": 1.9495569188820723, + "grad_norm": 0.38006502389907837, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 1430 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.38100454211235046, + "learning_rate": 0.0002, + "loss": 1.8323, + "step": 1440 + }, + { + "epoch": 1.9768234492160872, + "grad_norm": 0.3405798673629761, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 1450 + }, + { + "epoch": 1.990456714383095, + "grad_norm": 0.36582913994789124, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1460 + }, + { + "epoch": 2.0, + "eval_loss": 1.8178424835205078, + "eval_runtime": 53.6524, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 1.193, + "step": 1467 + }, + { + "epoch": 2.0040899795501024, + "grad_norm": 0.3626647889614105, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 1470 + }, + { + "epoch": 2.01772324471711, + "grad_norm": 0.40171775221824646, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 1480 + }, + { + "epoch": 2.0313565098841173, + "grad_norm": 0.5805319547653198, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 1490 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 0.41954153776168823, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 1500 + }, + { + "epoch": 2.0586230402181322, + "grad_norm": 0.47190725803375244, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1510 + }, + { + "epoch": 2.0722563053851397, + "grad_norm": 0.4388456344604492, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1520 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 2.2171926498413086, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 1530 + }, + { + "epoch": 2.0995228357191547, + "grad_norm": 0.4314221143722534, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 1540 + }, + { + "epoch": 2.113156100886162, + "grad_norm": 0.4154265522956848, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 1550 + }, + { + "epoch": 2.1267893660531696, + "grad_norm": 0.5025539994239807, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1560 + }, + { + "epoch": 2.140422631220177, + "grad_norm": 0.5410493016242981, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1570 + }, + { + "epoch": 2.1540558963871845, + "grad_norm": 0.4478487968444824, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 1580 + }, + { + "epoch": 2.1676891615541924, + "grad_norm": 0.4703652560710907, + "learning_rate": 0.0002, + "loss": 1.5536, + "step": 1590 + }, + { + "epoch": 2.1813224267212, + "grad_norm": 0.4555390179157257, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1600 + }, + { + "epoch": 2.1949556918882074, + "grad_norm": 0.4877263903617859, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1610 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.48708245158195496, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1620 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.47523951530456543, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 1630 + }, + { + "epoch": 2.23585548738923, + "grad_norm": 0.4889199733734131, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 1640 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 0.4585252106189728, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 1650 + }, + { + "epoch": 2.2631220177232447, + "grad_norm": 0.4764868915081024, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1660 + }, + { + "epoch": 2.276755282890252, + "grad_norm": 0.5028976202011108, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 1670 + }, + { + "epoch": 2.2903885480572597, + "grad_norm": 0.46131211519241333, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 1680 + }, + { + "epoch": 2.304021813224267, + "grad_norm": 0.5422874689102173, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 1690 + }, + { + "epoch": 2.3176550783912746, + "grad_norm": 0.47615355253219604, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1700 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 0.48005548119544983, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1710 + }, + { + "epoch": 2.3449216087252895, + "grad_norm": 0.4387182295322418, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1720 + }, + { + "epoch": 2.358554873892297, + "grad_norm": 0.4487272799015045, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 1730 + }, + { + "epoch": 2.372188139059305, + "grad_norm": 0.5046455264091492, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 1740 + }, + { + "epoch": 2.3858214042263124, + "grad_norm": 0.4653521180152893, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 1750 + }, + { + "epoch": 2.39945466939332, + "grad_norm": 0.4737723469734192, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 1760 + }, + { + "epoch": 2.4130879345603273, + "grad_norm": 0.4501931071281433, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 1770 + }, + { + "epoch": 2.426721199727335, + "grad_norm": 0.4772880971431732, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 1780 + }, + { + "epoch": 2.4403544648943423, + "grad_norm": 0.4544616937637329, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 1790 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.488313227891922, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 1800 + }, + { + "epoch": 2.467620995228357, + "grad_norm": 0.5057830214500427, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 1810 + }, + { + "epoch": 2.4812542603953647, + "grad_norm": 0.5049484968185425, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 1820 + }, + { + "epoch": 2.494887525562372, + "grad_norm": 0.44966644048690796, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1830 + }, + { + "epoch": 2.5085207907293796, + "grad_norm": 0.5072630643844604, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1840 + }, + { + "epoch": 2.522154055896387, + "grad_norm": 0.43989792466163635, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1850 + }, + { + "epoch": 2.5357873210633946, + "grad_norm": 1.3504403829574585, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1860 + }, + { + "epoch": 2.549420586230402, + "grad_norm": 0.46545976400375366, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 1870 + }, + { + "epoch": 2.5630538513974095, + "grad_norm": 0.4678342044353485, + "learning_rate": 0.0002, + "loss": 1.6368, + "step": 1880 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.529755711555481, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 1890 + }, + { + "epoch": 2.5903203817314244, + "grad_norm": 0.5000199675559998, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 1900 + }, + { + "epoch": 2.6039536468984323, + "grad_norm": 0.5649300217628479, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 1910 + }, + { + "epoch": 2.61758691206544, + "grad_norm": 0.7920585870742798, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 1920 + }, + { + "epoch": 2.6312201772324473, + "grad_norm": 0.4960342049598694, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 1930 + }, + { + "epoch": 2.6448534423994547, + "grad_norm": 0.5324710011482239, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1940 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 0.606343150138855, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 1950 + }, + { + "epoch": 2.6721199727334697, + "grad_norm": 0.53038489818573, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1960 + }, + { + "epoch": 2.685753237900477, + "grad_norm": 0.4579465091228485, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 1970 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.4541707932949066, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 1980 + }, + { + "epoch": 2.713019768234492, + "grad_norm": 0.5009395480155945, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 1990 + }, + { + "epoch": 2.7266530334014996, + "grad_norm": 0.4723006784915924, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 2000 + }, + { + "epoch": 2.740286298568507, + "grad_norm": 0.5086126923561096, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2010 + }, + { + "epoch": 2.7539195637355145, + "grad_norm": 0.47242608666419983, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2020 + }, + { + "epoch": 2.767552828902522, + "grad_norm": 0.44922566413879395, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2030 + }, + { + "epoch": 2.78118609406953, + "grad_norm": 0.420259565114975, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 2040 + }, + { + "epoch": 2.794819359236537, + "grad_norm": 0.4762881100177765, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 2050 + }, + { + "epoch": 2.808452624403545, + "grad_norm": 0.5228786468505859, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 2060 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.4796035587787628, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 2070 + }, + { + "epoch": 2.8357191547375598, + "grad_norm": 0.5034735202789307, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 2080 + }, + { + "epoch": 2.8493524199045672, + "grad_norm": 0.48005399107933044, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 2090 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 0.578820526599884, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2100 + }, + { + "epoch": 2.876618950238582, + "grad_norm": 0.48982638120651245, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 2110 + }, + { + "epoch": 2.8902522154055896, + "grad_norm": 0.5157325863838196, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2120 + }, + { + "epoch": 2.903885480572597, + "grad_norm": 0.49149683117866516, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 2130 + }, + { + "epoch": 2.9175187457396046, + "grad_norm": 0.48584499955177307, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 2140 + }, + { + "epoch": 2.931152010906612, + "grad_norm": 0.5199017524719238, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 2150 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.5788236856460571, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 2160 + }, + { + "epoch": 2.958418541240627, + "grad_norm": 0.48664185404777527, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2170 + }, + { + "epoch": 2.9720518064076344, + "grad_norm": 0.5026682615280151, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2180 + }, + { + "epoch": 2.9856850715746424, + "grad_norm": 0.49317044019699097, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 2190 + }, + { + "epoch": 2.9993183367416494, + "grad_norm": 0.5729128122329712, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 2200 + }, + { + "epoch": 2.9993183367416494, + "eval_loss": 1.8527295589447021, + "eval_runtime": 53.6403, + "eval_samples_per_second": 9.452, + "eval_steps_per_second": 1.193, + "step": 2200 + }, + { + "epoch": 3.0129516019086573, + "grad_norm": 0.5530241131782532, + "learning_rate": 0.0002, + "loss": 1.4719, + "step": 2210 + }, + { + "epoch": 3.0265848670756648, + "grad_norm": 0.6642216444015503, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 2220 + }, + { + "epoch": 3.0402181322426722, + "grad_norm": 0.61470627784729, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 2230 + }, + { + "epoch": 3.0538513974096797, + "grad_norm": 0.8559566140174866, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 2240 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.7015801668167114, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 2250 + }, + { + "epoch": 3.0811179277436946, + "grad_norm": 0.7226442694664001, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2260 + }, + { + "epoch": 3.094751192910702, + "grad_norm": 0.7560588717460632, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 2270 + }, + { + "epoch": 3.1083844580777096, + "grad_norm": 0.6216568946838379, + "learning_rate": 0.0002, + "loss": 1.4395, + "step": 2280 + }, + { + "epoch": 3.122017723244717, + "grad_norm": 0.6768500804901123, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 2290 + }, + { + "epoch": 3.1356509884117245, + "grad_norm": 0.7028762102127075, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 2300 + }, + { + "epoch": 3.149284253578732, + "grad_norm": 0.6329697966575623, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 2310 + }, + { + "epoch": 3.1629175187457395, + "grad_norm": 0.6328264474868774, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 2320 + }, + { + "epoch": 3.176550783912747, + "grad_norm": 0.7573632001876831, + "learning_rate": 0.0002, + "loss": 1.3762, + "step": 2330 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.595740795135498, + "learning_rate": 0.0002, + "loss": 1.3553, + "step": 2340 + }, + { + "epoch": 3.2038173142467623, + "grad_norm": 0.7111806869506836, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2350 + }, + { + "epoch": 3.2174505794137698, + "grad_norm": 0.6328730583190918, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 2360 + }, + { + "epoch": 3.2310838445807772, + "grad_norm": 0.5860254168510437, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 2370 + }, + { + "epoch": 3.2447171097477847, + "grad_norm": 0.7387157082557678, + "learning_rate": 0.0002, + "loss": 1.4267, + "step": 2380 + }, + { + "epoch": 3.258350374914792, + "grad_norm": 0.6897673606872559, + "learning_rate": 0.0002, + "loss": 1.4837, + "step": 2390 + }, + { + "epoch": 3.2719836400817996, + "grad_norm": 0.7157699465751648, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 2400 + }, + { + "epoch": 3.285616905248807, + "grad_norm": 0.6422511339187622, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 2410 + }, + { + "epoch": 3.2992501704158146, + "grad_norm": 1.0481886863708496, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 2420 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.7050786018371582, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 2430 + }, + { + "epoch": 3.3265167007498295, + "grad_norm": 0.6090759038925171, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 2440 + }, + { + "epoch": 3.340149965916837, + "grad_norm": 0.6626465320587158, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 2450 + }, + { + "epoch": 3.3537832310838445, + "grad_norm": 0.6565486788749695, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 2460 + }, + { + "epoch": 3.367416496250852, + "grad_norm": 0.6449528932571411, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2470 + }, + { + "epoch": 3.3810497614178594, + "grad_norm": 0.7746227383613586, + "learning_rate": 0.0002, + "loss": 1.4773, + "step": 2480 + }, + { + "epoch": 3.3946830265848673, + "grad_norm": 0.7074846029281616, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 2490 + }, + { + "epoch": 3.4083162917518743, + "grad_norm": 0.6547690033912659, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 2500 + }, + { + "epoch": 3.4219495569188823, + "grad_norm": 0.784721314907074, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2510 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.7270277738571167, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 2520 + }, + { + "epoch": 3.449216087252897, + "grad_norm": 0.67588871717453, + "learning_rate": 0.0002, + "loss": 1.4354, + "step": 2530 + }, + { + "epoch": 3.4628493524199047, + "grad_norm": 0.6768023371696472, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2540 + }, + { + "epoch": 3.476482617586912, + "grad_norm": 0.7026481628417969, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 2550 + }, + { + "epoch": 3.4901158827539196, + "grad_norm": 0.646075963973999, + "learning_rate": 0.0002, + "loss": 1.468, + "step": 2560 + }, + { + "epoch": 3.503749147920927, + "grad_norm": 0.6288973689079285, + "learning_rate": 0.0002, + "loss": 1.4058, + "step": 2570 + }, + { + "epoch": 3.5173824130879345, + "grad_norm": 0.6440825462341309, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 2580 + }, + { + "epoch": 3.531015678254942, + "grad_norm": 0.7074111700057983, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2590 + }, + { + "epoch": 3.5446489434219495, + "grad_norm": 0.7007562518119812, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2600 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 0.6045376658439636, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 2610 + }, + { + "epoch": 3.5719154737559644, + "grad_norm": 0.9149952530860901, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 2620 + }, + { + "epoch": 3.585548738922972, + "grad_norm": 0.6490362882614136, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2630 + }, + { + "epoch": 3.59918200408998, + "grad_norm": 0.6552226543426514, + "learning_rate": 0.0002, + "loss": 1.4107, + "step": 2640 + }, + { + "epoch": 3.612815269256987, + "grad_norm": 0.6541850566864014, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 2650 + }, + { + "epoch": 3.6264485344239947, + "grad_norm": 0.6500770449638367, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 2660 + }, + { + "epoch": 3.640081799591002, + "grad_norm": 0.6345893740653992, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2670 + }, + { + "epoch": 3.6537150647580097, + "grad_norm": 0.6382275223731995, + "learning_rate": 0.0002, + "loss": 1.3634, + "step": 2680 + }, + { + "epoch": 3.667348329925017, + "grad_norm": 0.6738566160202026, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 2690 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.7446315288543701, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 2700 + }, + { + "epoch": 3.694614860259032, + "grad_norm": 0.6717571020126343, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 2710 + }, + { + "epoch": 3.7082481254260395, + "grad_norm": 0.667259693145752, + "learning_rate": 0.0002, + "loss": 1.4285, + "step": 2720 + }, + { + "epoch": 3.721881390593047, + "grad_norm": 0.6808622479438782, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 2730 + }, + { + "epoch": 3.7355146557600545, + "grad_norm": 0.7254287004470825, + "learning_rate": 0.0002, + "loss": 1.4297, + "step": 2740 + }, + { + "epoch": 3.749147920927062, + "grad_norm": 0.6864007711410522, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 2750 + }, + { + "epoch": 3.7627811860940694, + "grad_norm": 0.7041361331939697, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 2760 + }, + { + "epoch": 3.776414451261077, + "grad_norm": 0.6559903025627136, + "learning_rate": 0.0002, + "loss": 1.4284, + "step": 2770 + }, + { + "epoch": 3.7900477164280844, + "grad_norm": 0.6602269411087036, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 2780 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.692611813545227, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 2790 + }, + { + "epoch": 3.8173142467620993, + "grad_norm": 0.7051475644111633, + "learning_rate": 0.0002, + "loss": 1.4065, + "step": 2800 + }, + { + "epoch": 3.830947511929107, + "grad_norm": 0.6685371398925781, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 2810 + }, + { + "epoch": 3.8445807770961147, + "grad_norm": 0.6706477403640747, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2820 + }, + { + "epoch": 3.858214042263122, + "grad_norm": 0.6671637296676636, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 2830 + }, + { + "epoch": 3.8718473074301296, + "grad_norm": 0.694092333316803, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 2840 + }, + { + "epoch": 3.885480572597137, + "grad_norm": 0.7349600195884705, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 2850 + }, + { + "epoch": 3.8991138377641446, + "grad_norm": 0.6647971868515015, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 2860 + }, + { + "epoch": 3.912747102931152, + "grad_norm": 0.806656539440155, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2870 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.6008772850036621, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2880 + }, + { + "epoch": 3.940013633265167, + "grad_norm": 0.659227728843689, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2890 + }, + { + "epoch": 3.9536468984321744, + "grad_norm": 0.6357656717300415, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2900 + }, + { + "epoch": 3.967280163599182, + "grad_norm": 0.6541687846183777, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2910 + }, + { + "epoch": 3.9809134287661894, + "grad_norm": 0.6090909838676453, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2920 + }, + { + "epoch": 3.994546693933197, + "grad_norm": 0.7198411822319031, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2930 + }, + { + "epoch": 4.0, + "eval_loss": 1.9278366565704346, + "eval_runtime": 53.6567, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 2934 + } + ], + "logging_steps": 10, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5070122967302144e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2934/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..40154ff703d01ea7d5182e11e3373f7b82b8cca8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:886255eb323b08886d2d430ad04a712afc30b8023f25ea26a3c02c6f10a9ab72 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b0b9d4293b3f35982502da5170b7e23a15bebc5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5962bbcfb2896cdb8738e2c25f4ea65b58945ba973b1e5689d489e4b831f4e9 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..296581d1745771f65b84c9a7388e617bf35f3b70 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4c8dadd8ca3b2ae8518245449e0bbab16bc3271b1ac2e131fec9530a85f362c +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..447bdf5a368652ae6eaf600dea6e5223f98e5503 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69dd9f4175df7ddd0bf47bdd257e171fe3c9ae00d7e681fb5b6267fdcaf2677c +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a5c357ce245dc9cafb9e6134e1580dc10b584e8a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/trainer_state.json @@ -0,0 +1,2635 @@ +{ + "best_metric": 1.8171186447143555, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", + "epoch": 4.999318336741649, + "eval_steps": 10, + "global_step": 3667, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013633265167007498, + "grad_norm": 0.7714291214942932, + "learning_rate": 0.0002, + "loss": 3.0982, + "step": 10 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.5473978519439697, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 20 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.5452795624732971, + "learning_rate": 0.0002, + "loss": 2.3079, + "step": 30 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.5098028779029846, + "learning_rate": 0.0002, + "loss": 2.0019, + "step": 40 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.48062971234321594, + "learning_rate": 0.0002, + "loss": 1.9333, + "step": 50 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.4505695104598999, + "learning_rate": 0.0002, + "loss": 1.9355, + "step": 60 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.41609591245651245, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 70 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.4323892593383789, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 80 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.4670293629169464, + "learning_rate": 0.0002, + "loss": 1.9294, + "step": 90 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.40623316168785095, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 100 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.3620383143424988, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 110 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.332218736410141, + "learning_rate": 0.0002, + "loss": 1.9238, + "step": 120 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.4004521667957306, + "learning_rate": 0.0002, + "loss": 1.93, + "step": 130 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 140 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.3847949504852295, + "learning_rate": 0.0002, + "loss": 1.8771, + "step": 150 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.36843451857566833, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 160 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.37301021814346313, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 170 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.3718886971473694, + "learning_rate": 0.0002, + "loss": 1.8909, + "step": 180 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.3088490962982178, + "learning_rate": 0.0002, + "loss": 1.8454, + "step": 190 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.3611852526664734, + "learning_rate": 0.0002, + "loss": 1.9254, + "step": 200 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.36093324422836304, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 210 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.3250400722026825, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 220 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.3566756248474121, + "learning_rate": 0.0002, + "loss": 1.8729, + "step": 230 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.32872408628463745, + "learning_rate": 0.0002, + "loss": 1.9259, + "step": 240 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.3983881175518036, + "learning_rate": 0.0002, + "loss": 1.9033, + "step": 250 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.3571510910987854, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 260 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.3036131262779236, + "learning_rate": 0.0002, + "loss": 1.8539, + "step": 270 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.36512863636016846, + "learning_rate": 0.0002, + "loss": 1.8572, + "step": 280 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.3429736793041229, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 290 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.3055964708328247, + "learning_rate": 0.0002, + "loss": 1.8754, + "step": 300 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.33801034092903137, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 310 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.348783016204834, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 320 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.3057514727115631, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 330 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.3849763572216034, + "learning_rate": 0.0002, + "loss": 1.8766, + "step": 340 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.30080053210258484, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 350 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.3595106303691864, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 360 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.31099820137023926, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 370 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.3157978355884552, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 380 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.27960965037345886, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 390 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.3102385103702545, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 400 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.32828861474990845, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 410 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.29560017585754395, + "learning_rate": 0.0002, + "loss": 1.8165, + "step": 420 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.33316895365715027, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 430 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.30420982837677, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.32619214057922363, + "learning_rate": 0.0002, + "loss": 1.7565, + "step": 450 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.3603750765323639, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 460 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.30834096670150757, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 470 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.28756365180015564, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 480 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.2878406345844269, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 490 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.31329697370529175, + "learning_rate": 0.0002, + "loss": 1.8581, + "step": 500 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.3405822515487671, + "learning_rate": 0.0002, + "loss": 1.7886, + "step": 510 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.305560827255249, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 520 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.2973416745662689, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 530 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.327303946018219, + "learning_rate": 0.0002, + "loss": 1.8223, + "step": 540 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.62595534324646, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 550 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.3129784166812897, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 560 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.32496583461761475, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 570 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.3098868131637573, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 580 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.30726853013038635, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 590 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.2964220643043518, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 600 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.32352274656295776, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 610 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.2938912510871887, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 620 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.295559823513031, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 630 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.34102028608322144, + "learning_rate": 0.0002, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.29676181077957153, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.3108902871608734, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 660 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.2690821588039398, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 670 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 680 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.8029476404190063, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 690 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.30534422397613525, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 700 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.2899954319000244, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 710 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.28814372420310974, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 720 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.3061596751213074, + "learning_rate": 0.0002, + "loss": 1.8865, + "step": 730 + }, + { + "epoch": 0.9993183367416496, + "eval_loss": 1.8171186447143555, + "eval_runtime": 53.6047, + "eval_samples_per_second": 9.458, + "eval_steps_per_second": 1.194, + "step": 733 + }, + { + "epoch": 1.008861622358555, + "grad_norm": 0.3140897750854492, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 740 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.3346109390258789, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 750 + }, + { + "epoch": 1.0361281526925699, + "grad_norm": 0.3582976758480072, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 760 + }, + { + "epoch": 1.0497614178595773, + "grad_norm": 0.30408260226249695, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 770 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 0.323585569858551, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 780 + }, + { + "epoch": 1.0770279481935923, + "grad_norm": 0.3474137783050537, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 790 + }, + { + "epoch": 1.0906612133606, + "grad_norm": 0.35721147060394287, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 800 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.35366931557655334, + "learning_rate": 0.0002, + "loss": 1.718, + "step": 810 + }, + { + "epoch": 1.117927743694615, + "grad_norm": 0.3250770568847656, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 820 + }, + { + "epoch": 1.1315610088616224, + "grad_norm": 0.3293766379356384, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 830 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 0.3380851745605469, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 840 + }, + { + "epoch": 1.1588275391956373, + "grad_norm": 0.32584455609321594, + "learning_rate": 0.0002, + "loss": 1.8236, + "step": 850 + }, + { + "epoch": 1.1724608043626448, + "grad_norm": 0.45700767636299133, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 860 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 0.30944544076919556, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 870 + }, + { + "epoch": 1.19972733469666, + "grad_norm": 0.3268151581287384, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 880 + }, + { + "epoch": 1.2133605998636674, + "grad_norm": 0.39972540736198425, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 890 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.7890929579734802, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 900 + }, + { + "epoch": 1.2406271301976823, + "grad_norm": 0.3439182639122009, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 910 + }, + { + "epoch": 1.2542603953646898, + "grad_norm": 0.3986225128173828, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 920 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 0.3514605164527893, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 930 + }, + { + "epoch": 1.2815269256987047, + "grad_norm": 0.3682589530944824, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 940 + }, + { + "epoch": 1.2951601908657122, + "grad_norm": 0.3618335723876953, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 950 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 0.345700740814209, + "learning_rate": 0.0002, + "loss": 1.7436, + "step": 960 + }, + { + "epoch": 1.3224267211997274, + "grad_norm": 0.3514927923679352, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 970 + }, + { + "epoch": 1.3360599863667348, + "grad_norm": 0.365647554397583, + "learning_rate": 0.0002, + "loss": 1.7704, + "step": 980 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.3407285809516907, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 990 + }, + { + "epoch": 1.3633265167007498, + "grad_norm": 0.3785437345504761, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1000 + }, + { + "epoch": 1.3769597818677572, + "grad_norm": 0.34746724367141724, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1010 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 0.362444132566452, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1020 + }, + { + "epoch": 1.4042263122017724, + "grad_norm": 0.4424704611301422, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1030 + }, + { + "epoch": 1.4178595773687799, + "grad_norm": 0.38722458481788635, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 1040 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.36089080572128296, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 1050 + }, + { + "epoch": 1.4451261077027948, + "grad_norm": 0.33817124366760254, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 1060 + }, + { + "epoch": 1.4587593728698023, + "grad_norm": 0.34334081411361694, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 1070 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.3776826858520508, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1080 + }, + { + "epoch": 1.4860259032038172, + "grad_norm": 0.4169026017189026, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 1090 + }, + { + "epoch": 1.4996591683708247, + "grad_norm": 0.34898945689201355, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 1100 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 0.34223780035972595, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1110 + }, + { + "epoch": 1.5269256987048399, + "grad_norm": 0.3686901032924652, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1120 + }, + { + "epoch": 1.5405589638718473, + "grad_norm": 0.35054415464401245, + "learning_rate": 0.0002, + "loss": 1.7525, + "step": 1130 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 0.39496365189552307, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 1140 + }, + { + "epoch": 1.5678254942058623, + "grad_norm": 0.35451626777648926, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 1150 + }, + { + "epoch": 1.58145875937287, + "grad_norm": 0.3848083019256592, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1160 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.3760537803173065, + "learning_rate": 0.0002, + "loss": 1.7272, + "step": 1170 + }, + { + "epoch": 1.6087252897068849, + "grad_norm": 0.38981738686561584, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1180 + }, + { + "epoch": 1.6223585548738924, + "grad_norm": 0.36830949783325195, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 1190 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.3405892848968506, + "learning_rate": 0.0002, + "loss": 1.6925, + "step": 1200 + }, + { + "epoch": 1.6496250852079073, + "grad_norm": 0.39027872681617737, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 1210 + }, + { + "epoch": 1.6632583503749148, + "grad_norm": 0.3342694044113159, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 1220 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 0.3600076735019684, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1230 + }, + { + "epoch": 1.6905248807089297, + "grad_norm": 0.3625542223453522, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1240 + }, + { + "epoch": 1.7041581458759372, + "grad_norm": 0.32170894742012024, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1250 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.3544139862060547, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1260 + }, + { + "epoch": 1.7314246762099523, + "grad_norm": 0.35113027691841125, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1270 + }, + { + "epoch": 1.7450579413769598, + "grad_norm": 0.3499974310398102, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1280 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 0.3285157382488251, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1290 + }, + { + "epoch": 1.7723244717109747, + "grad_norm": 0.3701961636543274, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1300 + }, + { + "epoch": 1.7859577368779824, + "grad_norm": 0.3301318287849426, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 1310 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 0.37801554799079895, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 1320 + }, + { + "epoch": 1.8132242672119974, + "grad_norm": 0.3726748526096344, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1330 + }, + { + "epoch": 1.8268575323790048, + "grad_norm": 0.4059790074825287, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1340 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.35712096095085144, + "learning_rate": 0.0002, + "loss": 1.7739, + "step": 1350 + }, + { + "epoch": 1.8541240627130198, + "grad_norm": 0.35995328426361084, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1360 + }, + { + "epoch": 1.8677573278800272, + "grad_norm": 0.3679947257041931, + "learning_rate": 0.0002, + "loss": 1.7332, + "step": 1370 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 0.39645957946777344, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 1380 + }, + { + "epoch": 1.8950238582140422, + "grad_norm": 0.35288700461387634, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1390 + }, + { + "epoch": 1.9086571233810496, + "grad_norm": 0.32579198479652405, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 1400 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 0.3856561779975891, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1410 + }, + { + "epoch": 1.9359236537150648, + "grad_norm": 0.39019331336021423, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 1420 + }, + { + "epoch": 1.9495569188820723, + "grad_norm": 0.38006502389907837, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 1430 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.38100454211235046, + "learning_rate": 0.0002, + "loss": 1.8323, + "step": 1440 + }, + { + "epoch": 1.9768234492160872, + "grad_norm": 0.3405798673629761, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 1450 + }, + { + "epoch": 1.990456714383095, + "grad_norm": 0.36582913994789124, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1460 + }, + { + "epoch": 2.0, + "eval_loss": 1.8178424835205078, + "eval_runtime": 53.6524, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 1.193, + "step": 1467 + }, + { + "epoch": 2.0040899795501024, + "grad_norm": 0.3626647889614105, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 1470 + }, + { + "epoch": 2.01772324471711, + "grad_norm": 0.40171775221824646, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 1480 + }, + { + "epoch": 2.0313565098841173, + "grad_norm": 0.5805319547653198, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 1490 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 0.41954153776168823, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 1500 + }, + { + "epoch": 2.0586230402181322, + "grad_norm": 0.47190725803375244, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1510 + }, + { + "epoch": 2.0722563053851397, + "grad_norm": 0.4388456344604492, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1520 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 2.2171926498413086, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 1530 + }, + { + "epoch": 2.0995228357191547, + "grad_norm": 0.4314221143722534, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 1540 + }, + { + "epoch": 2.113156100886162, + "grad_norm": 0.4154265522956848, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 1550 + }, + { + "epoch": 2.1267893660531696, + "grad_norm": 0.5025539994239807, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1560 + }, + { + "epoch": 2.140422631220177, + "grad_norm": 0.5410493016242981, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1570 + }, + { + "epoch": 2.1540558963871845, + "grad_norm": 0.4478487968444824, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 1580 + }, + { + "epoch": 2.1676891615541924, + "grad_norm": 0.4703652560710907, + "learning_rate": 0.0002, + "loss": 1.5536, + "step": 1590 + }, + { + "epoch": 2.1813224267212, + "grad_norm": 0.4555390179157257, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1600 + }, + { + "epoch": 2.1949556918882074, + "grad_norm": 0.4877263903617859, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1610 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.48708245158195496, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1620 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.47523951530456543, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 1630 + }, + { + "epoch": 2.23585548738923, + "grad_norm": 0.4889199733734131, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 1640 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 0.4585252106189728, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 1650 + }, + { + "epoch": 2.2631220177232447, + "grad_norm": 0.4764868915081024, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1660 + }, + { + "epoch": 2.276755282890252, + "grad_norm": 0.5028976202011108, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 1670 + }, + { + "epoch": 2.2903885480572597, + "grad_norm": 0.46131211519241333, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 1680 + }, + { + "epoch": 2.304021813224267, + "grad_norm": 0.5422874689102173, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 1690 + }, + { + "epoch": 2.3176550783912746, + "grad_norm": 0.47615355253219604, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1700 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 0.48005548119544983, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1710 + }, + { + "epoch": 2.3449216087252895, + "grad_norm": 0.4387182295322418, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1720 + }, + { + "epoch": 2.358554873892297, + "grad_norm": 0.4487272799015045, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 1730 + }, + { + "epoch": 2.372188139059305, + "grad_norm": 0.5046455264091492, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 1740 + }, + { + "epoch": 2.3858214042263124, + "grad_norm": 0.4653521180152893, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 1750 + }, + { + "epoch": 2.39945466939332, + "grad_norm": 0.4737723469734192, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 1760 + }, + { + "epoch": 2.4130879345603273, + "grad_norm": 0.4501931071281433, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 1770 + }, + { + "epoch": 2.426721199727335, + "grad_norm": 0.4772880971431732, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 1780 + }, + { + "epoch": 2.4403544648943423, + "grad_norm": 0.4544616937637329, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 1790 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.488313227891922, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 1800 + }, + { + "epoch": 2.467620995228357, + "grad_norm": 0.5057830214500427, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 1810 + }, + { + "epoch": 2.4812542603953647, + "grad_norm": 0.5049484968185425, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 1820 + }, + { + "epoch": 2.494887525562372, + "grad_norm": 0.44966644048690796, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1830 + }, + { + "epoch": 2.5085207907293796, + "grad_norm": 0.5072630643844604, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1840 + }, + { + "epoch": 2.522154055896387, + "grad_norm": 0.43989792466163635, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1850 + }, + { + "epoch": 2.5357873210633946, + "grad_norm": 1.3504403829574585, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1860 + }, + { + "epoch": 2.549420586230402, + "grad_norm": 0.46545976400375366, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 1870 + }, + { + "epoch": 2.5630538513974095, + "grad_norm": 0.4678342044353485, + "learning_rate": 0.0002, + "loss": 1.6368, + "step": 1880 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.529755711555481, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 1890 + }, + { + "epoch": 2.5903203817314244, + "grad_norm": 0.5000199675559998, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 1900 + }, + { + "epoch": 2.6039536468984323, + "grad_norm": 0.5649300217628479, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 1910 + }, + { + "epoch": 2.61758691206544, + "grad_norm": 0.7920585870742798, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 1920 + }, + { + "epoch": 2.6312201772324473, + "grad_norm": 0.4960342049598694, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 1930 + }, + { + "epoch": 2.6448534423994547, + "grad_norm": 0.5324710011482239, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1940 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 0.606343150138855, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 1950 + }, + { + "epoch": 2.6721199727334697, + "grad_norm": 0.53038489818573, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1960 + }, + { + "epoch": 2.685753237900477, + "grad_norm": 0.4579465091228485, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 1970 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.4541707932949066, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 1980 + }, + { + "epoch": 2.713019768234492, + "grad_norm": 0.5009395480155945, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 1990 + }, + { + "epoch": 2.7266530334014996, + "grad_norm": 0.4723006784915924, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 2000 + }, + { + "epoch": 2.740286298568507, + "grad_norm": 0.5086126923561096, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2010 + }, + { + "epoch": 2.7539195637355145, + "grad_norm": 0.47242608666419983, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2020 + }, + { + "epoch": 2.767552828902522, + "grad_norm": 0.44922566413879395, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2030 + }, + { + "epoch": 2.78118609406953, + "grad_norm": 0.420259565114975, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 2040 + }, + { + "epoch": 2.794819359236537, + "grad_norm": 0.4762881100177765, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 2050 + }, + { + "epoch": 2.808452624403545, + "grad_norm": 0.5228786468505859, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 2060 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.4796035587787628, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 2070 + }, + { + "epoch": 2.8357191547375598, + "grad_norm": 0.5034735202789307, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 2080 + }, + { + "epoch": 2.8493524199045672, + "grad_norm": 0.48005399107933044, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 2090 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 0.578820526599884, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2100 + }, + { + "epoch": 2.876618950238582, + "grad_norm": 0.48982638120651245, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 2110 + }, + { + "epoch": 2.8902522154055896, + "grad_norm": 0.5157325863838196, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2120 + }, + { + "epoch": 2.903885480572597, + "grad_norm": 0.49149683117866516, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 2130 + }, + { + "epoch": 2.9175187457396046, + "grad_norm": 0.48584499955177307, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 2140 + }, + { + "epoch": 2.931152010906612, + "grad_norm": 0.5199017524719238, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 2150 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.5788236856460571, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 2160 + }, + { + "epoch": 2.958418541240627, + "grad_norm": 0.48664185404777527, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2170 + }, + { + "epoch": 2.9720518064076344, + "grad_norm": 0.5026682615280151, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2180 + }, + { + "epoch": 2.9856850715746424, + "grad_norm": 0.49317044019699097, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 2190 + }, + { + "epoch": 2.9993183367416494, + "grad_norm": 0.5729128122329712, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 2200 + }, + { + "epoch": 2.9993183367416494, + "eval_loss": 1.8527295589447021, + "eval_runtime": 53.6403, + "eval_samples_per_second": 9.452, + "eval_steps_per_second": 1.193, + "step": 2200 + }, + { + "epoch": 3.0129516019086573, + "grad_norm": 0.5530241131782532, + "learning_rate": 0.0002, + "loss": 1.4719, + "step": 2210 + }, + { + "epoch": 3.0265848670756648, + "grad_norm": 0.6642216444015503, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 2220 + }, + { + "epoch": 3.0402181322426722, + "grad_norm": 0.61470627784729, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 2230 + }, + { + "epoch": 3.0538513974096797, + "grad_norm": 0.8559566140174866, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 2240 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.7015801668167114, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 2250 + }, + { + "epoch": 3.0811179277436946, + "grad_norm": 0.7226442694664001, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2260 + }, + { + "epoch": 3.094751192910702, + "grad_norm": 0.7560588717460632, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 2270 + }, + { + "epoch": 3.1083844580777096, + "grad_norm": 0.6216568946838379, + "learning_rate": 0.0002, + "loss": 1.4395, + "step": 2280 + }, + { + "epoch": 3.122017723244717, + "grad_norm": 0.6768500804901123, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 2290 + }, + { + "epoch": 3.1356509884117245, + "grad_norm": 0.7028762102127075, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 2300 + }, + { + "epoch": 3.149284253578732, + "grad_norm": 0.6329697966575623, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 2310 + }, + { + "epoch": 3.1629175187457395, + "grad_norm": 0.6328264474868774, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 2320 + }, + { + "epoch": 3.176550783912747, + "grad_norm": 0.7573632001876831, + "learning_rate": 0.0002, + "loss": 1.3762, + "step": 2330 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.595740795135498, + "learning_rate": 0.0002, + "loss": 1.3553, + "step": 2340 + }, + { + "epoch": 3.2038173142467623, + "grad_norm": 0.7111806869506836, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2350 + }, + { + "epoch": 3.2174505794137698, + "grad_norm": 0.6328730583190918, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 2360 + }, + { + "epoch": 3.2310838445807772, + "grad_norm": 0.5860254168510437, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 2370 + }, + { + "epoch": 3.2447171097477847, + "grad_norm": 0.7387157082557678, + "learning_rate": 0.0002, + "loss": 1.4267, + "step": 2380 + }, + { + "epoch": 3.258350374914792, + "grad_norm": 0.6897673606872559, + "learning_rate": 0.0002, + "loss": 1.4837, + "step": 2390 + }, + { + "epoch": 3.2719836400817996, + "grad_norm": 0.7157699465751648, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 2400 + }, + { + "epoch": 3.285616905248807, + "grad_norm": 0.6422511339187622, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 2410 + }, + { + "epoch": 3.2992501704158146, + "grad_norm": 1.0481886863708496, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 2420 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.7050786018371582, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 2430 + }, + { + "epoch": 3.3265167007498295, + "grad_norm": 0.6090759038925171, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 2440 + }, + { + "epoch": 3.340149965916837, + "grad_norm": 0.6626465320587158, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 2450 + }, + { + "epoch": 3.3537832310838445, + "grad_norm": 0.6565486788749695, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 2460 + }, + { + "epoch": 3.367416496250852, + "grad_norm": 0.6449528932571411, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2470 + }, + { + "epoch": 3.3810497614178594, + "grad_norm": 0.7746227383613586, + "learning_rate": 0.0002, + "loss": 1.4773, + "step": 2480 + }, + { + "epoch": 3.3946830265848673, + "grad_norm": 0.7074846029281616, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 2490 + }, + { + "epoch": 3.4083162917518743, + "grad_norm": 0.6547690033912659, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 2500 + }, + { + "epoch": 3.4219495569188823, + "grad_norm": 0.784721314907074, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2510 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.7270277738571167, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 2520 + }, + { + "epoch": 3.449216087252897, + "grad_norm": 0.67588871717453, + "learning_rate": 0.0002, + "loss": 1.4354, + "step": 2530 + }, + { + "epoch": 3.4628493524199047, + "grad_norm": 0.6768023371696472, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2540 + }, + { + "epoch": 3.476482617586912, + "grad_norm": 0.7026481628417969, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 2550 + }, + { + "epoch": 3.4901158827539196, + "grad_norm": 0.646075963973999, + "learning_rate": 0.0002, + "loss": 1.468, + "step": 2560 + }, + { + "epoch": 3.503749147920927, + "grad_norm": 0.6288973689079285, + "learning_rate": 0.0002, + "loss": 1.4058, + "step": 2570 + }, + { + "epoch": 3.5173824130879345, + "grad_norm": 0.6440825462341309, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 2580 + }, + { + "epoch": 3.531015678254942, + "grad_norm": 0.7074111700057983, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2590 + }, + { + "epoch": 3.5446489434219495, + "grad_norm": 0.7007562518119812, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2600 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 0.6045376658439636, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 2610 + }, + { + "epoch": 3.5719154737559644, + "grad_norm": 0.9149952530860901, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 2620 + }, + { + "epoch": 3.585548738922972, + "grad_norm": 0.6490362882614136, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2630 + }, + { + "epoch": 3.59918200408998, + "grad_norm": 0.6552226543426514, + "learning_rate": 0.0002, + "loss": 1.4107, + "step": 2640 + }, + { + "epoch": 3.612815269256987, + "grad_norm": 0.6541850566864014, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 2650 + }, + { + "epoch": 3.6264485344239947, + "grad_norm": 0.6500770449638367, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 2660 + }, + { + "epoch": 3.640081799591002, + "grad_norm": 0.6345893740653992, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2670 + }, + { + "epoch": 3.6537150647580097, + "grad_norm": 0.6382275223731995, + "learning_rate": 0.0002, + "loss": 1.3634, + "step": 2680 + }, + { + "epoch": 3.667348329925017, + "grad_norm": 0.6738566160202026, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 2690 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.7446315288543701, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 2700 + }, + { + "epoch": 3.694614860259032, + "grad_norm": 0.6717571020126343, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 2710 + }, + { + "epoch": 3.7082481254260395, + "grad_norm": 0.667259693145752, + "learning_rate": 0.0002, + "loss": 1.4285, + "step": 2720 + }, + { + "epoch": 3.721881390593047, + "grad_norm": 0.6808622479438782, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 2730 + }, + { + "epoch": 3.7355146557600545, + "grad_norm": 0.7254287004470825, + "learning_rate": 0.0002, + "loss": 1.4297, + "step": 2740 + }, + { + "epoch": 3.749147920927062, + "grad_norm": 0.6864007711410522, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 2750 + }, + { + "epoch": 3.7627811860940694, + "grad_norm": 0.7041361331939697, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 2760 + }, + { + "epoch": 3.776414451261077, + "grad_norm": 0.6559903025627136, + "learning_rate": 0.0002, + "loss": 1.4284, + "step": 2770 + }, + { + "epoch": 3.7900477164280844, + "grad_norm": 0.6602269411087036, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 2780 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.692611813545227, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 2790 + }, + { + "epoch": 3.8173142467620993, + "grad_norm": 0.7051475644111633, + "learning_rate": 0.0002, + "loss": 1.4065, + "step": 2800 + }, + { + "epoch": 3.830947511929107, + "grad_norm": 0.6685371398925781, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 2810 + }, + { + "epoch": 3.8445807770961147, + "grad_norm": 0.6706477403640747, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2820 + }, + { + "epoch": 3.858214042263122, + "grad_norm": 0.6671637296676636, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 2830 + }, + { + "epoch": 3.8718473074301296, + "grad_norm": 0.694092333316803, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 2840 + }, + { + "epoch": 3.885480572597137, + "grad_norm": 0.7349600195884705, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 2850 + }, + { + "epoch": 3.8991138377641446, + "grad_norm": 0.6647971868515015, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 2860 + }, + { + "epoch": 3.912747102931152, + "grad_norm": 0.806656539440155, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2870 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.6008772850036621, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2880 + }, + { + "epoch": 3.940013633265167, + "grad_norm": 0.659227728843689, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2890 + }, + { + "epoch": 3.9536468984321744, + "grad_norm": 0.6357656717300415, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2900 + }, + { + "epoch": 3.967280163599182, + "grad_norm": 0.6541687846183777, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2910 + }, + { + "epoch": 3.9809134287661894, + "grad_norm": 0.6090909838676453, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2920 + }, + { + "epoch": 3.994546693933197, + "grad_norm": 0.7198411822319031, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2930 + }, + { + "epoch": 4.0, + "eval_loss": 1.9278366565704346, + "eval_runtime": 53.6567, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 2934 + }, + { + "epoch": 4.008179959100205, + "grad_norm": 0.6498575210571289, + "learning_rate": 0.0002, + "loss": 1.3159, + "step": 2940 + }, + { + "epoch": 4.021813224267212, + "grad_norm": 0.865602433681488, + "learning_rate": 0.0002, + "loss": 1.2075, + "step": 2950 + }, + { + "epoch": 4.03544648943422, + "grad_norm": 0.8514999151229858, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 2960 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 1.0677322149276733, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 2970 + }, + { + "epoch": 4.062713019768235, + "grad_norm": 1.0126488208770752, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 2980 + }, + { + "epoch": 4.076346284935242, + "grad_norm": 1.0008870363235474, + "learning_rate": 0.0002, + "loss": 1.1631, + "step": 2990 + }, + { + "epoch": 4.08997955010225, + "grad_norm": 0.7942054271697998, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 3000 + }, + { + "epoch": 4.103612815269257, + "grad_norm": 1.0482100248336792, + "learning_rate": 0.0002, + "loss": 1.214, + "step": 3010 + }, + { + "epoch": 4.1172460804362645, + "grad_norm": 1.0516992807388306, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 3020 + }, + { + "epoch": 4.130879345603272, + "grad_norm": 0.8144322037696838, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 3030 + }, + { + "epoch": 4.144512610770279, + "grad_norm": 0.952297568321228, + "learning_rate": 0.0002, + "loss": 1.1782, + "step": 3040 + }, + { + "epoch": 4.158145875937287, + "grad_norm": 1.007645606994629, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 3050 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 1.0480353832244873, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 3060 + }, + { + "epoch": 4.185412406271302, + "grad_norm": 0.9270663857460022, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 3070 + }, + { + "epoch": 4.199045671438309, + "grad_norm": 1.3415262699127197, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 3080 + }, + { + "epoch": 4.212678936605317, + "grad_norm": 1.167606234550476, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 3090 + }, + { + "epoch": 4.226312201772324, + "grad_norm": 0.9418690800666809, + "learning_rate": 0.0002, + "loss": 1.2605, + "step": 3100 + }, + { + "epoch": 4.239945466939332, + "grad_norm": 1.0885876417160034, + "learning_rate": 0.0002, + "loss": 1.2184, + "step": 3110 + }, + { + "epoch": 4.253578732106339, + "grad_norm": 0.9165483713150024, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 3120 + }, + { + "epoch": 4.267211997273347, + "grad_norm": 0.9154694080352783, + "learning_rate": 0.0002, + "loss": 1.2933, + "step": 3130 + }, + { + "epoch": 4.280845262440354, + "grad_norm": 1.100580096244812, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3140 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.9367576241493225, + "learning_rate": 0.0002, + "loss": 1.251, + "step": 3150 + }, + { + "epoch": 4.308111792774369, + "grad_norm": 0.9744015336036682, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3160 + }, + { + "epoch": 4.321745057941377, + "grad_norm": 0.9865175485610962, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 3170 + }, + { + "epoch": 4.335378323108385, + "grad_norm": 1.0124907493591309, + "learning_rate": 0.0002, + "loss": 1.2161, + "step": 3180 + }, + { + "epoch": 4.349011588275392, + "grad_norm": 1.1044819355010986, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 3190 + }, + { + "epoch": 4.3626448534424, + "grad_norm": 0.9305577278137207, + "learning_rate": 0.0002, + "loss": 1.2483, + "step": 3200 + }, + { + "epoch": 4.376278118609407, + "grad_norm": 0.969265341758728, + "learning_rate": 0.0002, + "loss": 1.2101, + "step": 3210 + }, + { + "epoch": 4.389911383776415, + "grad_norm": 1.0671923160552979, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 3220 + }, + { + "epoch": 4.403544648943422, + "grad_norm": 0.9440539479255676, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 3230 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 0.9824562668800354, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 3240 + }, + { + "epoch": 4.430811179277437, + "grad_norm": 1.0245535373687744, + "learning_rate": 0.0002, + "loss": 1.2234, + "step": 3250 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.9629312753677368, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 3260 + }, + { + "epoch": 4.458077709611452, + "grad_norm": 1.1556470394134521, + "learning_rate": 0.0002, + "loss": 1.2689, + "step": 3270 + }, + { + "epoch": 4.47171097477846, + "grad_norm": 0.9796679019927979, + "learning_rate": 0.0002, + "loss": 1.2214, + "step": 3280 + }, + { + "epoch": 4.485344239945467, + "grad_norm": 0.9030535221099854, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3290 + }, + { + "epoch": 4.4989775051124745, + "grad_norm": 0.9142820835113525, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 3300 + }, + { + "epoch": 4.5126107702794815, + "grad_norm": 0.966867208480835, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 3310 + }, + { + "epoch": 4.5262440354464895, + "grad_norm": 1.0127079486846924, + "learning_rate": 0.0002, + "loss": 1.2537, + "step": 3320 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 1.055506706237793, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 3330 + }, + { + "epoch": 4.553510565780504, + "grad_norm": 0.9831468462944031, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 3340 + }, + { + "epoch": 4.567143830947512, + "grad_norm": 0.9304661154747009, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 3350 + }, + { + "epoch": 4.580777096114519, + "grad_norm": 0.9369107484817505, + "learning_rate": 0.0002, + "loss": 1.3621, + "step": 3360 + }, + { + "epoch": 4.594410361281527, + "grad_norm": 1.009506344795227, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 3370 + }, + { + "epoch": 4.608043626448534, + "grad_norm": 1.0575741529464722, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 3380 + }, + { + "epoch": 4.621676891615542, + "grad_norm": 0.9102860689163208, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 3390 + }, + { + "epoch": 4.635310156782549, + "grad_norm": 0.8111315965652466, + "learning_rate": 0.0002, + "loss": 1.3156, + "step": 3400 + }, + { + "epoch": 4.648943421949557, + "grad_norm": 0.9459649920463562, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3410 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 0.9709545969963074, + "learning_rate": 0.0002, + "loss": 1.3146, + "step": 3420 + }, + { + "epoch": 4.676209952283572, + "grad_norm": 0.9909247159957886, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 3430 + }, + { + "epoch": 4.689843217450579, + "grad_norm": 0.9094610810279846, + "learning_rate": 0.0002, + "loss": 1.3186, + "step": 3440 + }, + { + "epoch": 4.703476482617587, + "grad_norm": 0.9012220501899719, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 3450 + }, + { + "epoch": 4.717109747784594, + "grad_norm": 0.8669242858886719, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 3460 + }, + { + "epoch": 4.730743012951602, + "grad_norm": 0.9753699898719788, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 3470 + }, + { + "epoch": 4.74437627811861, + "grad_norm": 1.0252684354782104, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 3480 + }, + { + "epoch": 4.758009543285617, + "grad_norm": 1.208098292350769, + "learning_rate": 0.0002, + "loss": 1.2536, + "step": 3490 + }, + { + "epoch": 4.771642808452625, + "grad_norm": 0.8632914423942566, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 3500 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 1.0084818601608276, + "learning_rate": 0.0002, + "loss": 1.3062, + "step": 3510 + }, + { + "epoch": 4.79890933878664, + "grad_norm": 0.9095172882080078, + "learning_rate": 0.0002, + "loss": 1.3004, + "step": 3520 + }, + { + "epoch": 4.812542603953647, + "grad_norm": 0.9740135669708252, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 3530 + }, + { + "epoch": 4.826175869120655, + "grad_norm": 0.8862348794937134, + "learning_rate": 0.0002, + "loss": 1.2816, + "step": 3540 + }, + { + "epoch": 4.839809134287662, + "grad_norm": 1.0761774778366089, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 3550 + }, + { + "epoch": 4.85344239945467, + "grad_norm": 1.0134117603302002, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 3560 + }, + { + "epoch": 4.867075664621677, + "grad_norm": 0.9262851476669312, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 3570 + }, + { + "epoch": 4.8807089297886845, + "grad_norm": 0.9518504738807678, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 3580 + }, + { + "epoch": 4.894342194955692, + "grad_norm": 1.10103178024292, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 3590 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 1.0133225917816162, + "learning_rate": 0.0002, + "loss": 1.2592, + "step": 3600 + }, + { + "epoch": 4.9216087252897065, + "grad_norm": 0.9637737274169922, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 3610 + }, + { + "epoch": 4.935241990456714, + "grad_norm": 0.9800633192062378, + "learning_rate": 0.0002, + "loss": 1.2991, + "step": 3620 + }, + { + "epoch": 4.948875255623722, + "grad_norm": 1.0065973997116089, + "learning_rate": 0.0002, + "loss": 1.2872, + "step": 3630 + }, + { + "epoch": 4.962508520790729, + "grad_norm": 0.9354690313339233, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 3640 + }, + { + "epoch": 4.976141785957737, + "grad_norm": 0.9744119048118591, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 3650 + }, + { + "epoch": 4.989775051124744, + "grad_norm": 0.9357708096504211, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 3660 + }, + { + "epoch": 4.999318336741649, + "eval_loss": 2.0763096809387207, + "eval_runtime": 53.6578, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 3667 + } + ], + "logging_steps": 10, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.883765370912768e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3667/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2188e1bdc23b4ed3e71be03659b9c96893cb7646 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a790c1906953e4a115bc5678c3e6141b2eebc3df04b7b63d1093e0cbc9e53150 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a599a749dfa2d7f5a2508070ecde9baeb3db6ff --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36dffa0d48892f3e6cd812f44538d1d4e9159c8ca81e9e662b3f10af1bb8986 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d1c459a2a19a9211d1c0eb29abddb21d17ec9b1 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb4c8acbbdaeb646518182574e28dbbef8acd1ddffaf25ac2b857469af8a5bb2 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5590ace99e6075f0ae001c1026ccfa7ccbf440b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc75e6ddf6a19eaa52d203c945b4a84e1dd59632e32701773cae1cc55b1ec8f +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7c388f3b5327c261ca592f48bd6f8703e83c1c1b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/trainer_state.json @@ -0,0 +1,3161 @@ +{ + "best_metric": 1.8171186447143555, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 4401, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013633265167007498, + "grad_norm": 0.7714291214942932, + "learning_rate": 0.0002, + "loss": 3.0982, + "step": 10 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.5473978519439697, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 20 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.5452795624732971, + "learning_rate": 0.0002, + "loss": 2.3079, + "step": 30 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.5098028779029846, + "learning_rate": 0.0002, + "loss": 2.0019, + "step": 40 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.48062971234321594, + "learning_rate": 0.0002, + "loss": 1.9333, + "step": 50 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.4505695104598999, + "learning_rate": 0.0002, + "loss": 1.9355, + "step": 60 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.41609591245651245, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 70 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.4323892593383789, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 80 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.4670293629169464, + "learning_rate": 0.0002, + "loss": 1.9294, + "step": 90 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.40623316168785095, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 100 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.3620383143424988, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 110 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.332218736410141, + "learning_rate": 0.0002, + "loss": 1.9238, + "step": 120 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.4004521667957306, + "learning_rate": 0.0002, + "loss": 1.93, + "step": 130 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 140 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.3847949504852295, + "learning_rate": 0.0002, + "loss": 1.8771, + "step": 150 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.36843451857566833, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 160 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.37301021814346313, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 170 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.3718886971473694, + "learning_rate": 0.0002, + "loss": 1.8909, + "step": 180 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.3088490962982178, + "learning_rate": 0.0002, + "loss": 1.8454, + "step": 190 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.3611852526664734, + "learning_rate": 0.0002, + "loss": 1.9254, + "step": 200 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.36093324422836304, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 210 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.3250400722026825, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 220 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.3566756248474121, + "learning_rate": 0.0002, + "loss": 1.8729, + "step": 230 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.32872408628463745, + "learning_rate": 0.0002, + "loss": 1.9259, + "step": 240 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.3983881175518036, + "learning_rate": 0.0002, + "loss": 1.9033, + "step": 250 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.3571510910987854, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 260 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.3036131262779236, + "learning_rate": 0.0002, + "loss": 1.8539, + "step": 270 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.36512863636016846, + "learning_rate": 0.0002, + "loss": 1.8572, + "step": 280 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.3429736793041229, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 290 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.3055964708328247, + "learning_rate": 0.0002, + "loss": 1.8754, + "step": 300 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.33801034092903137, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 310 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.348783016204834, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 320 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.3057514727115631, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 330 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.3849763572216034, + "learning_rate": 0.0002, + "loss": 1.8766, + "step": 340 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.30080053210258484, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 350 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.3595106303691864, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 360 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.31099820137023926, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 370 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.3157978355884552, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 380 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.27960965037345886, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 390 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.3102385103702545, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 400 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.32828861474990845, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 410 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.29560017585754395, + "learning_rate": 0.0002, + "loss": 1.8165, + "step": 420 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.33316895365715027, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 430 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.30420982837677, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.32619214057922363, + "learning_rate": 0.0002, + "loss": 1.7565, + "step": 450 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.3603750765323639, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 460 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.30834096670150757, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 470 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.28756365180015564, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 480 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.2878406345844269, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 490 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.31329697370529175, + "learning_rate": 0.0002, + "loss": 1.8581, + "step": 500 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.3405822515487671, + "learning_rate": 0.0002, + "loss": 1.7886, + "step": 510 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.305560827255249, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 520 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.2973416745662689, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 530 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.327303946018219, + "learning_rate": 0.0002, + "loss": 1.8223, + "step": 540 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.62595534324646, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 550 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.3129784166812897, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 560 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.32496583461761475, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 570 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.3098868131637573, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 580 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.30726853013038635, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 590 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.2964220643043518, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 600 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.32352274656295776, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 610 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.2938912510871887, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 620 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.295559823513031, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 630 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.34102028608322144, + "learning_rate": 0.0002, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.29676181077957153, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.3108902871608734, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 660 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.2690821588039398, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 670 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 680 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.8029476404190063, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 690 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.30534422397613525, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 700 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.2899954319000244, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 710 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.28814372420310974, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 720 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.3061596751213074, + "learning_rate": 0.0002, + "loss": 1.8865, + "step": 730 + }, + { + "epoch": 0.9993183367416496, + "eval_loss": 1.8171186447143555, + "eval_runtime": 53.6047, + "eval_samples_per_second": 9.458, + "eval_steps_per_second": 1.194, + "step": 733 + }, + { + "epoch": 1.008861622358555, + "grad_norm": 0.3140897750854492, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 740 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.3346109390258789, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 750 + }, + { + "epoch": 1.0361281526925699, + "grad_norm": 0.3582976758480072, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 760 + }, + { + "epoch": 1.0497614178595773, + "grad_norm": 0.30408260226249695, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 770 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 0.323585569858551, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 780 + }, + { + "epoch": 1.0770279481935923, + "grad_norm": 0.3474137783050537, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 790 + }, + { + "epoch": 1.0906612133606, + "grad_norm": 0.35721147060394287, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 800 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.35366931557655334, + "learning_rate": 0.0002, + "loss": 1.718, + "step": 810 + }, + { + "epoch": 1.117927743694615, + "grad_norm": 0.3250770568847656, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 820 + }, + { + "epoch": 1.1315610088616224, + "grad_norm": 0.3293766379356384, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 830 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 0.3380851745605469, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 840 + }, + { + "epoch": 1.1588275391956373, + "grad_norm": 0.32584455609321594, + "learning_rate": 0.0002, + "loss": 1.8236, + "step": 850 + }, + { + "epoch": 1.1724608043626448, + "grad_norm": 0.45700767636299133, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 860 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 0.30944544076919556, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 870 + }, + { + "epoch": 1.19972733469666, + "grad_norm": 0.3268151581287384, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 880 + }, + { + "epoch": 1.2133605998636674, + "grad_norm": 0.39972540736198425, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 890 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.7890929579734802, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 900 + }, + { + "epoch": 1.2406271301976823, + "grad_norm": 0.3439182639122009, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 910 + }, + { + "epoch": 1.2542603953646898, + "grad_norm": 0.3986225128173828, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 920 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 0.3514605164527893, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 930 + }, + { + "epoch": 1.2815269256987047, + "grad_norm": 0.3682589530944824, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 940 + }, + { + "epoch": 1.2951601908657122, + "grad_norm": 0.3618335723876953, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 950 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 0.345700740814209, + "learning_rate": 0.0002, + "loss": 1.7436, + "step": 960 + }, + { + "epoch": 1.3224267211997274, + "grad_norm": 0.3514927923679352, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 970 + }, + { + "epoch": 1.3360599863667348, + "grad_norm": 0.365647554397583, + "learning_rate": 0.0002, + "loss": 1.7704, + "step": 980 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.3407285809516907, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 990 + }, + { + "epoch": 1.3633265167007498, + "grad_norm": 0.3785437345504761, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1000 + }, + { + "epoch": 1.3769597818677572, + "grad_norm": 0.34746724367141724, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1010 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 0.362444132566452, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1020 + }, + { + "epoch": 1.4042263122017724, + "grad_norm": 0.4424704611301422, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1030 + }, + { + "epoch": 1.4178595773687799, + "grad_norm": 0.38722458481788635, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 1040 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.36089080572128296, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 1050 + }, + { + "epoch": 1.4451261077027948, + "grad_norm": 0.33817124366760254, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 1060 + }, + { + "epoch": 1.4587593728698023, + "grad_norm": 0.34334081411361694, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 1070 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.3776826858520508, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1080 + }, + { + "epoch": 1.4860259032038172, + "grad_norm": 0.4169026017189026, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 1090 + }, + { + "epoch": 1.4996591683708247, + "grad_norm": 0.34898945689201355, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 1100 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 0.34223780035972595, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1110 + }, + { + "epoch": 1.5269256987048399, + "grad_norm": 0.3686901032924652, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1120 + }, + { + "epoch": 1.5405589638718473, + "grad_norm": 0.35054415464401245, + "learning_rate": 0.0002, + "loss": 1.7525, + "step": 1130 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 0.39496365189552307, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 1140 + }, + { + "epoch": 1.5678254942058623, + "grad_norm": 0.35451626777648926, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 1150 + }, + { + "epoch": 1.58145875937287, + "grad_norm": 0.3848083019256592, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1160 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.3760537803173065, + "learning_rate": 0.0002, + "loss": 1.7272, + "step": 1170 + }, + { + "epoch": 1.6087252897068849, + "grad_norm": 0.38981738686561584, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1180 + }, + { + "epoch": 1.6223585548738924, + "grad_norm": 0.36830949783325195, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 1190 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.3405892848968506, + "learning_rate": 0.0002, + "loss": 1.6925, + "step": 1200 + }, + { + "epoch": 1.6496250852079073, + "grad_norm": 0.39027872681617737, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 1210 + }, + { + "epoch": 1.6632583503749148, + "grad_norm": 0.3342694044113159, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 1220 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 0.3600076735019684, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1230 + }, + { + "epoch": 1.6905248807089297, + "grad_norm": 0.3625542223453522, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1240 + }, + { + "epoch": 1.7041581458759372, + "grad_norm": 0.32170894742012024, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1250 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.3544139862060547, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1260 + }, + { + "epoch": 1.7314246762099523, + "grad_norm": 0.35113027691841125, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1270 + }, + { + "epoch": 1.7450579413769598, + "grad_norm": 0.3499974310398102, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1280 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 0.3285157382488251, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1290 + }, + { + "epoch": 1.7723244717109747, + "grad_norm": 0.3701961636543274, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1300 + }, + { + "epoch": 1.7859577368779824, + "grad_norm": 0.3301318287849426, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 1310 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 0.37801554799079895, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 1320 + }, + { + "epoch": 1.8132242672119974, + "grad_norm": 0.3726748526096344, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1330 + }, + { + "epoch": 1.8268575323790048, + "grad_norm": 0.4059790074825287, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1340 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.35712096095085144, + "learning_rate": 0.0002, + "loss": 1.7739, + "step": 1350 + }, + { + "epoch": 1.8541240627130198, + "grad_norm": 0.35995328426361084, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1360 + }, + { + "epoch": 1.8677573278800272, + "grad_norm": 0.3679947257041931, + "learning_rate": 0.0002, + "loss": 1.7332, + "step": 1370 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 0.39645957946777344, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 1380 + }, + { + "epoch": 1.8950238582140422, + "grad_norm": 0.35288700461387634, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1390 + }, + { + "epoch": 1.9086571233810496, + "grad_norm": 0.32579198479652405, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 1400 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 0.3856561779975891, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1410 + }, + { + "epoch": 1.9359236537150648, + "grad_norm": 0.39019331336021423, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 1420 + }, + { + "epoch": 1.9495569188820723, + "grad_norm": 0.38006502389907837, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 1430 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.38100454211235046, + "learning_rate": 0.0002, + "loss": 1.8323, + "step": 1440 + }, + { + "epoch": 1.9768234492160872, + "grad_norm": 0.3405798673629761, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 1450 + }, + { + "epoch": 1.990456714383095, + "grad_norm": 0.36582913994789124, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1460 + }, + { + "epoch": 2.0, + "eval_loss": 1.8178424835205078, + "eval_runtime": 53.6524, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 1.193, + "step": 1467 + }, + { + "epoch": 2.0040899795501024, + "grad_norm": 0.3626647889614105, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 1470 + }, + { + "epoch": 2.01772324471711, + "grad_norm": 0.40171775221824646, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 1480 + }, + { + "epoch": 2.0313565098841173, + "grad_norm": 0.5805319547653198, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 1490 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 0.41954153776168823, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 1500 + }, + { + "epoch": 2.0586230402181322, + "grad_norm": 0.47190725803375244, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1510 + }, + { + "epoch": 2.0722563053851397, + "grad_norm": 0.4388456344604492, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1520 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 2.2171926498413086, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 1530 + }, + { + "epoch": 2.0995228357191547, + "grad_norm": 0.4314221143722534, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 1540 + }, + { + "epoch": 2.113156100886162, + "grad_norm": 0.4154265522956848, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 1550 + }, + { + "epoch": 2.1267893660531696, + "grad_norm": 0.5025539994239807, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1560 + }, + { + "epoch": 2.140422631220177, + "grad_norm": 0.5410493016242981, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1570 + }, + { + "epoch": 2.1540558963871845, + "grad_norm": 0.4478487968444824, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 1580 + }, + { + "epoch": 2.1676891615541924, + "grad_norm": 0.4703652560710907, + "learning_rate": 0.0002, + "loss": 1.5536, + "step": 1590 + }, + { + "epoch": 2.1813224267212, + "grad_norm": 0.4555390179157257, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1600 + }, + { + "epoch": 2.1949556918882074, + "grad_norm": 0.4877263903617859, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1610 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.48708245158195496, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1620 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.47523951530456543, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 1630 + }, + { + "epoch": 2.23585548738923, + "grad_norm": 0.4889199733734131, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 1640 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 0.4585252106189728, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 1650 + }, + { + "epoch": 2.2631220177232447, + "grad_norm": 0.4764868915081024, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1660 + }, + { + "epoch": 2.276755282890252, + "grad_norm": 0.5028976202011108, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 1670 + }, + { + "epoch": 2.2903885480572597, + "grad_norm": 0.46131211519241333, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 1680 + }, + { + "epoch": 2.304021813224267, + "grad_norm": 0.5422874689102173, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 1690 + }, + { + "epoch": 2.3176550783912746, + "grad_norm": 0.47615355253219604, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1700 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 0.48005548119544983, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1710 + }, + { + "epoch": 2.3449216087252895, + "grad_norm": 0.4387182295322418, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1720 + }, + { + "epoch": 2.358554873892297, + "grad_norm": 0.4487272799015045, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 1730 + }, + { + "epoch": 2.372188139059305, + "grad_norm": 0.5046455264091492, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 1740 + }, + { + "epoch": 2.3858214042263124, + "grad_norm": 0.4653521180152893, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 1750 + }, + { + "epoch": 2.39945466939332, + "grad_norm": 0.4737723469734192, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 1760 + }, + { + "epoch": 2.4130879345603273, + "grad_norm": 0.4501931071281433, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 1770 + }, + { + "epoch": 2.426721199727335, + "grad_norm": 0.4772880971431732, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 1780 + }, + { + "epoch": 2.4403544648943423, + "grad_norm": 0.4544616937637329, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 1790 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.488313227891922, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 1800 + }, + { + "epoch": 2.467620995228357, + "grad_norm": 0.5057830214500427, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 1810 + }, + { + "epoch": 2.4812542603953647, + "grad_norm": 0.5049484968185425, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 1820 + }, + { + "epoch": 2.494887525562372, + "grad_norm": 0.44966644048690796, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1830 + }, + { + "epoch": 2.5085207907293796, + "grad_norm": 0.5072630643844604, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1840 + }, + { + "epoch": 2.522154055896387, + "grad_norm": 0.43989792466163635, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1850 + }, + { + "epoch": 2.5357873210633946, + "grad_norm": 1.3504403829574585, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1860 + }, + { + "epoch": 2.549420586230402, + "grad_norm": 0.46545976400375366, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 1870 + }, + { + "epoch": 2.5630538513974095, + "grad_norm": 0.4678342044353485, + "learning_rate": 0.0002, + "loss": 1.6368, + "step": 1880 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.529755711555481, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 1890 + }, + { + "epoch": 2.5903203817314244, + "grad_norm": 0.5000199675559998, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 1900 + }, + { + "epoch": 2.6039536468984323, + "grad_norm": 0.5649300217628479, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 1910 + }, + { + "epoch": 2.61758691206544, + "grad_norm": 0.7920585870742798, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 1920 + }, + { + "epoch": 2.6312201772324473, + "grad_norm": 0.4960342049598694, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 1930 + }, + { + "epoch": 2.6448534423994547, + "grad_norm": 0.5324710011482239, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1940 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 0.606343150138855, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 1950 + }, + { + "epoch": 2.6721199727334697, + "grad_norm": 0.53038489818573, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1960 + }, + { + "epoch": 2.685753237900477, + "grad_norm": 0.4579465091228485, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 1970 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.4541707932949066, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 1980 + }, + { + "epoch": 2.713019768234492, + "grad_norm": 0.5009395480155945, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 1990 + }, + { + "epoch": 2.7266530334014996, + "grad_norm": 0.4723006784915924, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 2000 + }, + { + "epoch": 2.740286298568507, + "grad_norm": 0.5086126923561096, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2010 + }, + { + "epoch": 2.7539195637355145, + "grad_norm": 0.47242608666419983, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2020 + }, + { + "epoch": 2.767552828902522, + "grad_norm": 0.44922566413879395, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2030 + }, + { + "epoch": 2.78118609406953, + "grad_norm": 0.420259565114975, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 2040 + }, + { + "epoch": 2.794819359236537, + "grad_norm": 0.4762881100177765, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 2050 + }, + { + "epoch": 2.808452624403545, + "grad_norm": 0.5228786468505859, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 2060 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.4796035587787628, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 2070 + }, + { + "epoch": 2.8357191547375598, + "grad_norm": 0.5034735202789307, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 2080 + }, + { + "epoch": 2.8493524199045672, + "grad_norm": 0.48005399107933044, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 2090 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 0.578820526599884, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2100 + }, + { + "epoch": 2.876618950238582, + "grad_norm": 0.48982638120651245, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 2110 + }, + { + "epoch": 2.8902522154055896, + "grad_norm": 0.5157325863838196, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2120 + }, + { + "epoch": 2.903885480572597, + "grad_norm": 0.49149683117866516, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 2130 + }, + { + "epoch": 2.9175187457396046, + "grad_norm": 0.48584499955177307, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 2140 + }, + { + "epoch": 2.931152010906612, + "grad_norm": 0.5199017524719238, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 2150 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.5788236856460571, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 2160 + }, + { + "epoch": 2.958418541240627, + "grad_norm": 0.48664185404777527, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2170 + }, + { + "epoch": 2.9720518064076344, + "grad_norm": 0.5026682615280151, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2180 + }, + { + "epoch": 2.9856850715746424, + "grad_norm": 0.49317044019699097, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 2190 + }, + { + "epoch": 2.9993183367416494, + "grad_norm": 0.5729128122329712, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 2200 + }, + { + "epoch": 2.9993183367416494, + "eval_loss": 1.8527295589447021, + "eval_runtime": 53.6403, + "eval_samples_per_second": 9.452, + "eval_steps_per_second": 1.193, + "step": 2200 + }, + { + "epoch": 3.0129516019086573, + "grad_norm": 0.5530241131782532, + "learning_rate": 0.0002, + "loss": 1.4719, + "step": 2210 + }, + { + "epoch": 3.0265848670756648, + "grad_norm": 0.6642216444015503, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 2220 + }, + { + "epoch": 3.0402181322426722, + "grad_norm": 0.61470627784729, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 2230 + }, + { + "epoch": 3.0538513974096797, + "grad_norm": 0.8559566140174866, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 2240 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.7015801668167114, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 2250 + }, + { + "epoch": 3.0811179277436946, + "grad_norm": 0.7226442694664001, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2260 + }, + { + "epoch": 3.094751192910702, + "grad_norm": 0.7560588717460632, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 2270 + }, + { + "epoch": 3.1083844580777096, + "grad_norm": 0.6216568946838379, + "learning_rate": 0.0002, + "loss": 1.4395, + "step": 2280 + }, + { + "epoch": 3.122017723244717, + "grad_norm": 0.6768500804901123, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 2290 + }, + { + "epoch": 3.1356509884117245, + "grad_norm": 0.7028762102127075, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 2300 + }, + { + "epoch": 3.149284253578732, + "grad_norm": 0.6329697966575623, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 2310 + }, + { + "epoch": 3.1629175187457395, + "grad_norm": 0.6328264474868774, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 2320 + }, + { + "epoch": 3.176550783912747, + "grad_norm": 0.7573632001876831, + "learning_rate": 0.0002, + "loss": 1.3762, + "step": 2330 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.595740795135498, + "learning_rate": 0.0002, + "loss": 1.3553, + "step": 2340 + }, + { + "epoch": 3.2038173142467623, + "grad_norm": 0.7111806869506836, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2350 + }, + { + "epoch": 3.2174505794137698, + "grad_norm": 0.6328730583190918, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 2360 + }, + { + "epoch": 3.2310838445807772, + "grad_norm": 0.5860254168510437, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 2370 + }, + { + "epoch": 3.2447171097477847, + "grad_norm": 0.7387157082557678, + "learning_rate": 0.0002, + "loss": 1.4267, + "step": 2380 + }, + { + "epoch": 3.258350374914792, + "grad_norm": 0.6897673606872559, + "learning_rate": 0.0002, + "loss": 1.4837, + "step": 2390 + }, + { + "epoch": 3.2719836400817996, + "grad_norm": 0.7157699465751648, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 2400 + }, + { + "epoch": 3.285616905248807, + "grad_norm": 0.6422511339187622, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 2410 + }, + { + "epoch": 3.2992501704158146, + "grad_norm": 1.0481886863708496, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 2420 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.7050786018371582, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 2430 + }, + { + "epoch": 3.3265167007498295, + "grad_norm": 0.6090759038925171, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 2440 + }, + { + "epoch": 3.340149965916837, + "grad_norm": 0.6626465320587158, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 2450 + }, + { + "epoch": 3.3537832310838445, + "grad_norm": 0.6565486788749695, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 2460 + }, + { + "epoch": 3.367416496250852, + "grad_norm": 0.6449528932571411, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2470 + }, + { + "epoch": 3.3810497614178594, + "grad_norm": 0.7746227383613586, + "learning_rate": 0.0002, + "loss": 1.4773, + "step": 2480 + }, + { + "epoch": 3.3946830265848673, + "grad_norm": 0.7074846029281616, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 2490 + }, + { + "epoch": 3.4083162917518743, + "grad_norm": 0.6547690033912659, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 2500 + }, + { + "epoch": 3.4219495569188823, + "grad_norm": 0.784721314907074, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2510 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.7270277738571167, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 2520 + }, + { + "epoch": 3.449216087252897, + "grad_norm": 0.67588871717453, + "learning_rate": 0.0002, + "loss": 1.4354, + "step": 2530 + }, + { + "epoch": 3.4628493524199047, + "grad_norm": 0.6768023371696472, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2540 + }, + { + "epoch": 3.476482617586912, + "grad_norm": 0.7026481628417969, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 2550 + }, + { + "epoch": 3.4901158827539196, + "grad_norm": 0.646075963973999, + "learning_rate": 0.0002, + "loss": 1.468, + "step": 2560 + }, + { + "epoch": 3.503749147920927, + "grad_norm": 0.6288973689079285, + "learning_rate": 0.0002, + "loss": 1.4058, + "step": 2570 + }, + { + "epoch": 3.5173824130879345, + "grad_norm": 0.6440825462341309, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 2580 + }, + { + "epoch": 3.531015678254942, + "grad_norm": 0.7074111700057983, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2590 + }, + { + "epoch": 3.5446489434219495, + "grad_norm": 0.7007562518119812, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2600 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 0.6045376658439636, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 2610 + }, + { + "epoch": 3.5719154737559644, + "grad_norm": 0.9149952530860901, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 2620 + }, + { + "epoch": 3.585548738922972, + "grad_norm": 0.6490362882614136, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2630 + }, + { + "epoch": 3.59918200408998, + "grad_norm": 0.6552226543426514, + "learning_rate": 0.0002, + "loss": 1.4107, + "step": 2640 + }, + { + "epoch": 3.612815269256987, + "grad_norm": 0.6541850566864014, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 2650 + }, + { + "epoch": 3.6264485344239947, + "grad_norm": 0.6500770449638367, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 2660 + }, + { + "epoch": 3.640081799591002, + "grad_norm": 0.6345893740653992, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2670 + }, + { + "epoch": 3.6537150647580097, + "grad_norm": 0.6382275223731995, + "learning_rate": 0.0002, + "loss": 1.3634, + "step": 2680 + }, + { + "epoch": 3.667348329925017, + "grad_norm": 0.6738566160202026, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 2690 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.7446315288543701, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 2700 + }, + { + "epoch": 3.694614860259032, + "grad_norm": 0.6717571020126343, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 2710 + }, + { + "epoch": 3.7082481254260395, + "grad_norm": 0.667259693145752, + "learning_rate": 0.0002, + "loss": 1.4285, + "step": 2720 + }, + { + "epoch": 3.721881390593047, + "grad_norm": 0.6808622479438782, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 2730 + }, + { + "epoch": 3.7355146557600545, + "grad_norm": 0.7254287004470825, + "learning_rate": 0.0002, + "loss": 1.4297, + "step": 2740 + }, + { + "epoch": 3.749147920927062, + "grad_norm": 0.6864007711410522, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 2750 + }, + { + "epoch": 3.7627811860940694, + "grad_norm": 0.7041361331939697, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 2760 + }, + { + "epoch": 3.776414451261077, + "grad_norm": 0.6559903025627136, + "learning_rate": 0.0002, + "loss": 1.4284, + "step": 2770 + }, + { + "epoch": 3.7900477164280844, + "grad_norm": 0.6602269411087036, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 2780 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.692611813545227, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 2790 + }, + { + "epoch": 3.8173142467620993, + "grad_norm": 0.7051475644111633, + "learning_rate": 0.0002, + "loss": 1.4065, + "step": 2800 + }, + { + "epoch": 3.830947511929107, + "grad_norm": 0.6685371398925781, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 2810 + }, + { + "epoch": 3.8445807770961147, + "grad_norm": 0.6706477403640747, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2820 + }, + { + "epoch": 3.858214042263122, + "grad_norm": 0.6671637296676636, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 2830 + }, + { + "epoch": 3.8718473074301296, + "grad_norm": 0.694092333316803, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 2840 + }, + { + "epoch": 3.885480572597137, + "grad_norm": 0.7349600195884705, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 2850 + }, + { + "epoch": 3.8991138377641446, + "grad_norm": 0.6647971868515015, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 2860 + }, + { + "epoch": 3.912747102931152, + "grad_norm": 0.806656539440155, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2870 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.6008772850036621, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2880 + }, + { + "epoch": 3.940013633265167, + "grad_norm": 0.659227728843689, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2890 + }, + { + "epoch": 3.9536468984321744, + "grad_norm": 0.6357656717300415, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2900 + }, + { + "epoch": 3.967280163599182, + "grad_norm": 0.6541687846183777, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2910 + }, + { + "epoch": 3.9809134287661894, + "grad_norm": 0.6090909838676453, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2920 + }, + { + "epoch": 3.994546693933197, + "grad_norm": 0.7198411822319031, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2930 + }, + { + "epoch": 4.0, + "eval_loss": 1.9278366565704346, + "eval_runtime": 53.6567, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 2934 + }, + { + "epoch": 4.008179959100205, + "grad_norm": 0.6498575210571289, + "learning_rate": 0.0002, + "loss": 1.3159, + "step": 2940 + }, + { + "epoch": 4.021813224267212, + "grad_norm": 0.865602433681488, + "learning_rate": 0.0002, + "loss": 1.2075, + "step": 2950 + }, + { + "epoch": 4.03544648943422, + "grad_norm": 0.8514999151229858, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 2960 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 1.0677322149276733, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 2970 + }, + { + "epoch": 4.062713019768235, + "grad_norm": 1.0126488208770752, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 2980 + }, + { + "epoch": 4.076346284935242, + "grad_norm": 1.0008870363235474, + "learning_rate": 0.0002, + "loss": 1.1631, + "step": 2990 + }, + { + "epoch": 4.08997955010225, + "grad_norm": 0.7942054271697998, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 3000 + }, + { + "epoch": 4.103612815269257, + "grad_norm": 1.0482100248336792, + "learning_rate": 0.0002, + "loss": 1.214, + "step": 3010 + }, + { + "epoch": 4.1172460804362645, + "grad_norm": 1.0516992807388306, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 3020 + }, + { + "epoch": 4.130879345603272, + "grad_norm": 0.8144322037696838, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 3030 + }, + { + "epoch": 4.144512610770279, + "grad_norm": 0.952297568321228, + "learning_rate": 0.0002, + "loss": 1.1782, + "step": 3040 + }, + { + "epoch": 4.158145875937287, + "grad_norm": 1.007645606994629, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 3050 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 1.0480353832244873, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 3060 + }, + { + "epoch": 4.185412406271302, + "grad_norm": 0.9270663857460022, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 3070 + }, + { + "epoch": 4.199045671438309, + "grad_norm": 1.3415262699127197, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 3080 + }, + { + "epoch": 4.212678936605317, + "grad_norm": 1.167606234550476, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 3090 + }, + { + "epoch": 4.226312201772324, + "grad_norm": 0.9418690800666809, + "learning_rate": 0.0002, + "loss": 1.2605, + "step": 3100 + }, + { + "epoch": 4.239945466939332, + "grad_norm": 1.0885876417160034, + "learning_rate": 0.0002, + "loss": 1.2184, + "step": 3110 + }, + { + "epoch": 4.253578732106339, + "grad_norm": 0.9165483713150024, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 3120 + }, + { + "epoch": 4.267211997273347, + "grad_norm": 0.9154694080352783, + "learning_rate": 0.0002, + "loss": 1.2933, + "step": 3130 + }, + { + "epoch": 4.280845262440354, + "grad_norm": 1.100580096244812, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3140 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.9367576241493225, + "learning_rate": 0.0002, + "loss": 1.251, + "step": 3150 + }, + { + "epoch": 4.308111792774369, + "grad_norm": 0.9744015336036682, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3160 + }, + { + "epoch": 4.321745057941377, + "grad_norm": 0.9865175485610962, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 3170 + }, + { + "epoch": 4.335378323108385, + "grad_norm": 1.0124907493591309, + "learning_rate": 0.0002, + "loss": 1.2161, + "step": 3180 + }, + { + "epoch": 4.349011588275392, + "grad_norm": 1.1044819355010986, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 3190 + }, + { + "epoch": 4.3626448534424, + "grad_norm": 0.9305577278137207, + "learning_rate": 0.0002, + "loss": 1.2483, + "step": 3200 + }, + { + "epoch": 4.376278118609407, + "grad_norm": 0.969265341758728, + "learning_rate": 0.0002, + "loss": 1.2101, + "step": 3210 + }, + { + "epoch": 4.389911383776415, + "grad_norm": 1.0671923160552979, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 3220 + }, + { + "epoch": 4.403544648943422, + "grad_norm": 0.9440539479255676, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 3230 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 0.9824562668800354, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 3240 + }, + { + "epoch": 4.430811179277437, + "grad_norm": 1.0245535373687744, + "learning_rate": 0.0002, + "loss": 1.2234, + "step": 3250 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.9629312753677368, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 3260 + }, + { + "epoch": 4.458077709611452, + "grad_norm": 1.1556470394134521, + "learning_rate": 0.0002, + "loss": 1.2689, + "step": 3270 + }, + { + "epoch": 4.47171097477846, + "grad_norm": 0.9796679019927979, + "learning_rate": 0.0002, + "loss": 1.2214, + "step": 3280 + }, + { + "epoch": 4.485344239945467, + "grad_norm": 0.9030535221099854, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3290 + }, + { + "epoch": 4.4989775051124745, + "grad_norm": 0.9142820835113525, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 3300 + }, + { + "epoch": 4.5126107702794815, + "grad_norm": 0.966867208480835, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 3310 + }, + { + "epoch": 4.5262440354464895, + "grad_norm": 1.0127079486846924, + "learning_rate": 0.0002, + "loss": 1.2537, + "step": 3320 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 1.055506706237793, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 3330 + }, + { + "epoch": 4.553510565780504, + "grad_norm": 0.9831468462944031, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 3340 + }, + { + "epoch": 4.567143830947512, + "grad_norm": 0.9304661154747009, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 3350 + }, + { + "epoch": 4.580777096114519, + "grad_norm": 0.9369107484817505, + "learning_rate": 0.0002, + "loss": 1.3621, + "step": 3360 + }, + { + "epoch": 4.594410361281527, + "grad_norm": 1.009506344795227, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 3370 + }, + { + "epoch": 4.608043626448534, + "grad_norm": 1.0575741529464722, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 3380 + }, + { + "epoch": 4.621676891615542, + "grad_norm": 0.9102860689163208, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 3390 + }, + { + "epoch": 4.635310156782549, + "grad_norm": 0.8111315965652466, + "learning_rate": 0.0002, + "loss": 1.3156, + "step": 3400 + }, + { + "epoch": 4.648943421949557, + "grad_norm": 0.9459649920463562, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3410 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 0.9709545969963074, + "learning_rate": 0.0002, + "loss": 1.3146, + "step": 3420 + }, + { + "epoch": 4.676209952283572, + "grad_norm": 0.9909247159957886, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 3430 + }, + { + "epoch": 4.689843217450579, + "grad_norm": 0.9094610810279846, + "learning_rate": 0.0002, + "loss": 1.3186, + "step": 3440 + }, + { + "epoch": 4.703476482617587, + "grad_norm": 0.9012220501899719, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 3450 + }, + { + "epoch": 4.717109747784594, + "grad_norm": 0.8669242858886719, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 3460 + }, + { + "epoch": 4.730743012951602, + "grad_norm": 0.9753699898719788, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 3470 + }, + { + "epoch": 4.74437627811861, + "grad_norm": 1.0252684354782104, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 3480 + }, + { + "epoch": 4.758009543285617, + "grad_norm": 1.208098292350769, + "learning_rate": 0.0002, + "loss": 1.2536, + "step": 3490 + }, + { + "epoch": 4.771642808452625, + "grad_norm": 0.8632914423942566, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 3500 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 1.0084818601608276, + "learning_rate": 0.0002, + "loss": 1.3062, + "step": 3510 + }, + { + "epoch": 4.79890933878664, + "grad_norm": 0.9095172882080078, + "learning_rate": 0.0002, + "loss": 1.3004, + "step": 3520 + }, + { + "epoch": 4.812542603953647, + "grad_norm": 0.9740135669708252, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 3530 + }, + { + "epoch": 4.826175869120655, + "grad_norm": 0.8862348794937134, + "learning_rate": 0.0002, + "loss": 1.2816, + "step": 3540 + }, + { + "epoch": 4.839809134287662, + "grad_norm": 1.0761774778366089, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 3550 + }, + { + "epoch": 4.85344239945467, + "grad_norm": 1.0134117603302002, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 3560 + }, + { + "epoch": 4.867075664621677, + "grad_norm": 0.9262851476669312, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 3570 + }, + { + "epoch": 4.8807089297886845, + "grad_norm": 0.9518504738807678, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 3580 + }, + { + "epoch": 4.894342194955692, + "grad_norm": 1.10103178024292, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 3590 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 1.0133225917816162, + "learning_rate": 0.0002, + "loss": 1.2592, + "step": 3600 + }, + { + "epoch": 4.9216087252897065, + "grad_norm": 0.9637737274169922, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 3610 + }, + { + "epoch": 4.935241990456714, + "grad_norm": 0.9800633192062378, + "learning_rate": 0.0002, + "loss": 1.2991, + "step": 3620 + }, + { + "epoch": 4.948875255623722, + "grad_norm": 1.0065973997116089, + "learning_rate": 0.0002, + "loss": 1.2872, + "step": 3630 + }, + { + "epoch": 4.962508520790729, + "grad_norm": 0.9354690313339233, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 3640 + }, + { + "epoch": 4.976141785957737, + "grad_norm": 0.9744119048118591, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 3650 + }, + { + "epoch": 4.989775051124744, + "grad_norm": 0.9357708096504211, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 3660 + }, + { + "epoch": 4.999318336741649, + "eval_loss": 2.0763096809387207, + "eval_runtime": 53.6578, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 3667 + }, + { + "epoch": 5.003408316291752, + "grad_norm": 1.3171669244766235, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 3670 + }, + { + "epoch": 5.017041581458759, + "grad_norm": 1.4427374601364136, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 3680 + }, + { + "epoch": 5.030674846625767, + "grad_norm": 0.9313797354698181, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 3690 + }, + { + "epoch": 5.044308111792774, + "grad_norm": 1.417641282081604, + "learning_rate": 0.0002, + "loss": 0.9481, + "step": 3700 + }, + { + "epoch": 5.057941376959782, + "grad_norm": 1.097440242767334, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 3710 + }, + { + "epoch": 5.071574642126789, + "grad_norm": 1.4277986288070679, + "learning_rate": 0.0002, + "loss": 1.0416, + "step": 3720 + }, + { + "epoch": 5.085207907293797, + "grad_norm": 1.2520873546600342, + "learning_rate": 0.0002, + "loss": 0.9718, + "step": 3730 + }, + { + "epoch": 5.098841172460804, + "grad_norm": 1.39503812789917, + "learning_rate": 0.0002, + "loss": 0.9531, + "step": 3740 + }, + { + "epoch": 5.112474437627812, + "grad_norm": 1.2345329523086548, + "learning_rate": 0.0002, + "loss": 0.9658, + "step": 3750 + }, + { + "epoch": 5.126107702794819, + "grad_norm": 1.2700239419937134, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 3760 + }, + { + "epoch": 5.139740967961827, + "grad_norm": 1.5343066453933716, + "learning_rate": 0.0002, + "loss": 0.993, + "step": 3770 + }, + { + "epoch": 5.153374233128835, + "grad_norm": 1.4191608428955078, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 3780 + }, + { + "epoch": 5.167007498295842, + "grad_norm": 1.4591023921966553, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 3790 + }, + { + "epoch": 5.18064076346285, + "grad_norm": 1.6158121824264526, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 3800 + }, + { + "epoch": 5.194274028629857, + "grad_norm": 1.6077582836151123, + "learning_rate": 0.0002, + "loss": 1.0056, + "step": 3810 + }, + { + "epoch": 5.207907293796865, + "grad_norm": 1.2815653085708618, + "learning_rate": 0.0002, + "loss": 0.9711, + "step": 3820 + }, + { + "epoch": 5.221540558963872, + "grad_norm": 1.2427219152450562, + "learning_rate": 0.0002, + "loss": 1.0131, + "step": 3830 + }, + { + "epoch": 5.23517382413088, + "grad_norm": 1.3013232946395874, + "learning_rate": 0.0002, + "loss": 0.9901, + "step": 3840 + }, + { + "epoch": 5.248807089297887, + "grad_norm": 1.4643588066101074, + "learning_rate": 0.0002, + "loss": 0.9862, + "step": 3850 + }, + { + "epoch": 5.2624403544648946, + "grad_norm": 1.2571916580200195, + "learning_rate": 0.0002, + "loss": 1.0149, + "step": 3860 + }, + { + "epoch": 5.276073619631902, + "grad_norm": 1.226682186126709, + "learning_rate": 0.0002, + "loss": 0.9686, + "step": 3870 + }, + { + "epoch": 5.2897068847989095, + "grad_norm": 1.2541271448135376, + "learning_rate": 0.0002, + "loss": 0.9417, + "step": 3880 + }, + { + "epoch": 5.3033401499659165, + "grad_norm": 1.2340261936187744, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 3890 + }, + { + "epoch": 5.316973415132924, + "grad_norm": 1.345527172088623, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 3900 + }, + { + "epoch": 5.3306066802999315, + "grad_norm": 1.2128909826278687, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 3910 + }, + { + "epoch": 5.344239945466939, + "grad_norm": 1.3052637577056885, + "learning_rate": 0.0002, + "loss": 1.0002, + "step": 3920 + }, + { + "epoch": 5.357873210633947, + "grad_norm": 1.1017392873764038, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 3930 + }, + { + "epoch": 5.371506475800954, + "grad_norm": 1.26950204372406, + "learning_rate": 0.0002, + "loss": 1.0579, + "step": 3940 + }, + { + "epoch": 5.385139740967962, + "grad_norm": 1.3372546434402466, + "learning_rate": 0.0002, + "loss": 1.0816, + "step": 3950 + }, + { + "epoch": 5.398773006134969, + "grad_norm": 1.3115156888961792, + "learning_rate": 0.0002, + "loss": 1.0529, + "step": 3960 + }, + { + "epoch": 5.412406271301977, + "grad_norm": 1.3511474132537842, + "learning_rate": 0.0002, + "loss": 1.1179, + "step": 3970 + }, + { + "epoch": 5.426039536468984, + "grad_norm": 1.1001893281936646, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 3980 + }, + { + "epoch": 5.439672801635992, + "grad_norm": 1.2810745239257812, + "learning_rate": 0.0002, + "loss": 1.0855, + "step": 3990 + }, + { + "epoch": 5.453306066802999, + "grad_norm": 1.2999306917190552, + "learning_rate": 0.0002, + "loss": 1.0573, + "step": 4000 + }, + { + "epoch": 5.466939331970007, + "grad_norm": 1.172553300857544, + "learning_rate": 0.0002, + "loss": 1.0073, + "step": 4010 + }, + { + "epoch": 5.480572597137014, + "grad_norm": 1.1483557224273682, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 4020 + }, + { + "epoch": 5.494205862304022, + "grad_norm": 1.4148036241531372, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 4030 + }, + { + "epoch": 5.507839127471029, + "grad_norm": 1.1611121892929077, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 4040 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 1.3837119340896606, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 4050 + }, + { + "epoch": 5.535105657805044, + "grad_norm": 1.3025696277618408, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 4060 + }, + { + "epoch": 5.548738922972052, + "grad_norm": 1.348091959953308, + "learning_rate": 0.0002, + "loss": 1.0628, + "step": 4070 + }, + { + "epoch": 5.56237218813906, + "grad_norm": 1.3463449478149414, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 4080 + }, + { + "epoch": 5.576005453306067, + "grad_norm": 1.3904176950454712, + "learning_rate": 0.0002, + "loss": 1.039, + "step": 4090 + }, + { + "epoch": 5.589638718473074, + "grad_norm": 1.2737950086593628, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 4100 + }, + { + "epoch": 5.603271983640082, + "grad_norm": 1.3311827182769775, + "learning_rate": 0.0002, + "loss": 1.0441, + "step": 4110 + }, + { + "epoch": 5.61690524880709, + "grad_norm": 1.24485182762146, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 4120 + }, + { + "epoch": 5.630538513974097, + "grad_norm": 1.2724957466125488, + "learning_rate": 0.0002, + "loss": 1.1103, + "step": 4130 + }, + { + "epoch": 5.644171779141105, + "grad_norm": 1.3439847230911255, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 4140 + }, + { + "epoch": 5.657805044308112, + "grad_norm": 1.372359037399292, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 4150 + }, + { + "epoch": 5.6714383094751195, + "grad_norm": 1.2322949171066284, + "learning_rate": 0.0002, + "loss": 1.0475, + "step": 4160 + }, + { + "epoch": 5.6850715746421265, + "grad_norm": 1.4859193563461304, + "learning_rate": 0.0002, + "loss": 1.0465, + "step": 4170 + }, + { + "epoch": 5.6987048398091344, + "grad_norm": 1.4318448305130005, + "learning_rate": 0.0002, + "loss": 1.1569, + "step": 4180 + }, + { + "epoch": 5.7123381049761415, + "grad_norm": 1.1533565521240234, + "learning_rate": 0.0002, + "loss": 1.017, + "step": 4190 + }, + { + "epoch": 5.725971370143149, + "grad_norm": 1.3009696006774902, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 4200 + }, + { + "epoch": 5.739604635310156, + "grad_norm": 1.3972162008285522, + "learning_rate": 0.0002, + "loss": 1.1229, + "step": 4210 + }, + { + "epoch": 5.753237900477164, + "grad_norm": 1.2142186164855957, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 4220 + }, + { + "epoch": 5.766871165644172, + "grad_norm": 1.401191234588623, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 4230 + }, + { + "epoch": 5.780504430811179, + "grad_norm": 1.4124404191970825, + "learning_rate": 0.0002, + "loss": 1.0722, + "step": 4240 + }, + { + "epoch": 5.794137695978186, + "grad_norm": 1.3488332033157349, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 4250 + }, + { + "epoch": 5.807770961145194, + "grad_norm": 1.3671752214431763, + "learning_rate": 0.0002, + "loss": 1.0599, + "step": 4260 + }, + { + "epoch": 5.821404226312202, + "grad_norm": 1.2608201503753662, + "learning_rate": 0.0002, + "loss": 1.1294, + "step": 4270 + }, + { + "epoch": 5.835037491479209, + "grad_norm": 1.1814045906066895, + "learning_rate": 0.0002, + "loss": 1.1216, + "step": 4280 + }, + { + "epoch": 5.848670756646217, + "grad_norm": 1.4139586687088013, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 4290 + }, + { + "epoch": 5.862304021813224, + "grad_norm": 1.34248948097229, + "learning_rate": 0.0002, + "loss": 1.0656, + "step": 4300 + }, + { + "epoch": 5.875937286980232, + "grad_norm": 1.1428139209747314, + "learning_rate": 0.0002, + "loss": 1.0791, + "step": 4310 + }, + { + "epoch": 5.889570552147239, + "grad_norm": 1.1941087245941162, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 4320 + }, + { + "epoch": 5.903203817314247, + "grad_norm": 1.2374001741409302, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 4330 + }, + { + "epoch": 5.916837082481254, + "grad_norm": 1.4314988851547241, + "learning_rate": 0.0002, + "loss": 1.0802, + "step": 4340 + }, + { + "epoch": 5.930470347648262, + "grad_norm": 1.1286126375198364, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 4350 + }, + { + "epoch": 5.944103612815269, + "grad_norm": 1.25884211063385, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 4360 + }, + { + "epoch": 5.957736877982277, + "grad_norm": 1.223357915878296, + "learning_rate": 0.0002, + "loss": 1.1189, + "step": 4370 + }, + { + "epoch": 5.971370143149285, + "grad_norm": 1.2173810005187988, + "learning_rate": 0.0002, + "loss": 1.1335, + "step": 4380 + }, + { + "epoch": 5.985003408316292, + "grad_norm": 1.3152292966842651, + "learning_rate": 0.0002, + "loss": 1.1201, + "step": 4390 + }, + { + "epoch": 5.998636673483299, + "grad_norm": 1.5576739311218262, + "learning_rate": 0.0002, + "loss": 1.1456, + "step": 4400 + }, + { + "epoch": 6.0, + "eval_loss": 2.3435311317443848, + "eval_runtime": 53.6362, + "eval_samples_per_second": 9.453, + "eval_steps_per_second": 1.193, + "step": 4401 + } + ], + "logging_steps": 10, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2605184450953216e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4401/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6aac1a12f8d73d9bff6a56f35fc4e01fdb8b3ef --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bbfe19b1249c8fd39eec19852e8daef1c5a4b412213ec445004bdf81ddd97bc +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..98066215c4dbcc8c3b7827e83a729859e22370fa --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d01c52a8dd45b65ef67b88921f890a78e3db81781efd189bb78ae27306d98d1c +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..19b6052a9c95fdca4735766d87987424ca007bb6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e4a0c08105fcd5051852976ad1b66b5b6ee2938a154961e7953cc474b6cbbfd +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a8787355ac5166e0c748423c41345c39963d75b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dac04a313bf199901f4905a3dcb49ac1edffce136158830db1eeef05a7184287 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e623208074811bd4909907a5384b39f3585a815d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/trainer_state.json @@ -0,0 +1,3680 @@ +{ + "best_metric": 1.8171186447143555, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", + "epoch": 6.999318336741649, + "eval_steps": 10, + "global_step": 5134, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013633265167007498, + "grad_norm": 0.7714291214942932, + "learning_rate": 0.0002, + "loss": 3.0982, + "step": 10 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.5473978519439697, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 20 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.5452795624732971, + "learning_rate": 0.0002, + "loss": 2.3079, + "step": 30 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.5098028779029846, + "learning_rate": 0.0002, + "loss": 2.0019, + "step": 40 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.48062971234321594, + "learning_rate": 0.0002, + "loss": 1.9333, + "step": 50 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.4505695104598999, + "learning_rate": 0.0002, + "loss": 1.9355, + "step": 60 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.41609591245651245, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 70 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.4323892593383789, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 80 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.4670293629169464, + "learning_rate": 0.0002, + "loss": 1.9294, + "step": 90 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.40623316168785095, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 100 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.3620383143424988, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 110 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.332218736410141, + "learning_rate": 0.0002, + "loss": 1.9238, + "step": 120 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.4004521667957306, + "learning_rate": 0.0002, + "loss": 1.93, + "step": 130 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 140 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.3847949504852295, + "learning_rate": 0.0002, + "loss": 1.8771, + "step": 150 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.36843451857566833, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 160 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.37301021814346313, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 170 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.3718886971473694, + "learning_rate": 0.0002, + "loss": 1.8909, + "step": 180 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.3088490962982178, + "learning_rate": 0.0002, + "loss": 1.8454, + "step": 190 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.3611852526664734, + "learning_rate": 0.0002, + "loss": 1.9254, + "step": 200 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.36093324422836304, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 210 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.3250400722026825, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 220 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.3566756248474121, + "learning_rate": 0.0002, + "loss": 1.8729, + "step": 230 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.32872408628463745, + "learning_rate": 0.0002, + "loss": 1.9259, + "step": 240 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.3983881175518036, + "learning_rate": 0.0002, + "loss": 1.9033, + "step": 250 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.3571510910987854, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 260 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.3036131262779236, + "learning_rate": 0.0002, + "loss": 1.8539, + "step": 270 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.36512863636016846, + "learning_rate": 0.0002, + "loss": 1.8572, + "step": 280 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.3429736793041229, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 290 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.3055964708328247, + "learning_rate": 0.0002, + "loss": 1.8754, + "step": 300 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.33801034092903137, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 310 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.348783016204834, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 320 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.3057514727115631, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 330 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.3849763572216034, + "learning_rate": 0.0002, + "loss": 1.8766, + "step": 340 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.30080053210258484, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 350 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.3595106303691864, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 360 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.31099820137023926, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 370 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.3157978355884552, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 380 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.27960965037345886, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 390 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.3102385103702545, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 400 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.32828861474990845, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 410 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.29560017585754395, + "learning_rate": 0.0002, + "loss": 1.8165, + "step": 420 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.33316895365715027, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 430 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.30420982837677, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.32619214057922363, + "learning_rate": 0.0002, + "loss": 1.7565, + "step": 450 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.3603750765323639, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 460 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.30834096670150757, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 470 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.28756365180015564, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 480 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.2878406345844269, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 490 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.31329697370529175, + "learning_rate": 0.0002, + "loss": 1.8581, + "step": 500 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.3405822515487671, + "learning_rate": 0.0002, + "loss": 1.7886, + "step": 510 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.305560827255249, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 520 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.2973416745662689, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 530 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.327303946018219, + "learning_rate": 0.0002, + "loss": 1.8223, + "step": 540 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.62595534324646, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 550 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.3129784166812897, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 560 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.32496583461761475, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 570 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.3098868131637573, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 580 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.30726853013038635, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 590 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.2964220643043518, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 600 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.32352274656295776, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 610 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.2938912510871887, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 620 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.295559823513031, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 630 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.34102028608322144, + "learning_rate": 0.0002, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.29676181077957153, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.3108902871608734, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 660 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.2690821588039398, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 670 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 680 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.8029476404190063, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 690 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.30534422397613525, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 700 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.2899954319000244, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 710 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.28814372420310974, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 720 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.3061596751213074, + "learning_rate": 0.0002, + "loss": 1.8865, + "step": 730 + }, + { + "epoch": 0.9993183367416496, + "eval_loss": 1.8171186447143555, + "eval_runtime": 53.6047, + "eval_samples_per_second": 9.458, + "eval_steps_per_second": 1.194, + "step": 733 + }, + { + "epoch": 1.008861622358555, + "grad_norm": 0.3140897750854492, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 740 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.3346109390258789, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 750 + }, + { + "epoch": 1.0361281526925699, + "grad_norm": 0.3582976758480072, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 760 + }, + { + "epoch": 1.0497614178595773, + "grad_norm": 0.30408260226249695, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 770 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 0.323585569858551, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 780 + }, + { + "epoch": 1.0770279481935923, + "grad_norm": 0.3474137783050537, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 790 + }, + { + "epoch": 1.0906612133606, + "grad_norm": 0.35721147060394287, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 800 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.35366931557655334, + "learning_rate": 0.0002, + "loss": 1.718, + "step": 810 + }, + { + "epoch": 1.117927743694615, + "grad_norm": 0.3250770568847656, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 820 + }, + { + "epoch": 1.1315610088616224, + "grad_norm": 0.3293766379356384, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 830 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 0.3380851745605469, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 840 + }, + { + "epoch": 1.1588275391956373, + "grad_norm": 0.32584455609321594, + "learning_rate": 0.0002, + "loss": 1.8236, + "step": 850 + }, + { + "epoch": 1.1724608043626448, + "grad_norm": 0.45700767636299133, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 860 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 0.30944544076919556, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 870 + }, + { + "epoch": 1.19972733469666, + "grad_norm": 0.3268151581287384, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 880 + }, + { + "epoch": 1.2133605998636674, + "grad_norm": 0.39972540736198425, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 890 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.7890929579734802, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 900 + }, + { + "epoch": 1.2406271301976823, + "grad_norm": 0.3439182639122009, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 910 + }, + { + "epoch": 1.2542603953646898, + "grad_norm": 0.3986225128173828, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 920 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 0.3514605164527893, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 930 + }, + { + "epoch": 1.2815269256987047, + "grad_norm": 0.3682589530944824, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 940 + }, + { + "epoch": 1.2951601908657122, + "grad_norm": 0.3618335723876953, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 950 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 0.345700740814209, + "learning_rate": 0.0002, + "loss": 1.7436, + "step": 960 + }, + { + "epoch": 1.3224267211997274, + "grad_norm": 0.3514927923679352, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 970 + }, + { + "epoch": 1.3360599863667348, + "grad_norm": 0.365647554397583, + "learning_rate": 0.0002, + "loss": 1.7704, + "step": 980 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.3407285809516907, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 990 + }, + { + "epoch": 1.3633265167007498, + "grad_norm": 0.3785437345504761, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1000 + }, + { + "epoch": 1.3769597818677572, + "grad_norm": 0.34746724367141724, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1010 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 0.362444132566452, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1020 + }, + { + "epoch": 1.4042263122017724, + "grad_norm": 0.4424704611301422, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1030 + }, + { + "epoch": 1.4178595773687799, + "grad_norm": 0.38722458481788635, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 1040 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.36089080572128296, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 1050 + }, + { + "epoch": 1.4451261077027948, + "grad_norm": 0.33817124366760254, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 1060 + }, + { + "epoch": 1.4587593728698023, + "grad_norm": 0.34334081411361694, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 1070 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.3776826858520508, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1080 + }, + { + "epoch": 1.4860259032038172, + "grad_norm": 0.4169026017189026, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 1090 + }, + { + "epoch": 1.4996591683708247, + "grad_norm": 0.34898945689201355, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 1100 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 0.34223780035972595, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1110 + }, + { + "epoch": 1.5269256987048399, + "grad_norm": 0.3686901032924652, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1120 + }, + { + "epoch": 1.5405589638718473, + "grad_norm": 0.35054415464401245, + "learning_rate": 0.0002, + "loss": 1.7525, + "step": 1130 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 0.39496365189552307, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 1140 + }, + { + "epoch": 1.5678254942058623, + "grad_norm": 0.35451626777648926, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 1150 + }, + { + "epoch": 1.58145875937287, + "grad_norm": 0.3848083019256592, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1160 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.3760537803173065, + "learning_rate": 0.0002, + "loss": 1.7272, + "step": 1170 + }, + { + "epoch": 1.6087252897068849, + "grad_norm": 0.38981738686561584, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1180 + }, + { + "epoch": 1.6223585548738924, + "grad_norm": 0.36830949783325195, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 1190 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.3405892848968506, + "learning_rate": 0.0002, + "loss": 1.6925, + "step": 1200 + }, + { + "epoch": 1.6496250852079073, + "grad_norm": 0.39027872681617737, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 1210 + }, + { + "epoch": 1.6632583503749148, + "grad_norm": 0.3342694044113159, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 1220 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 0.3600076735019684, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1230 + }, + { + "epoch": 1.6905248807089297, + "grad_norm": 0.3625542223453522, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1240 + }, + { + "epoch": 1.7041581458759372, + "grad_norm": 0.32170894742012024, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1250 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.3544139862060547, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1260 + }, + { + "epoch": 1.7314246762099523, + "grad_norm": 0.35113027691841125, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1270 + }, + { + "epoch": 1.7450579413769598, + "grad_norm": 0.3499974310398102, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1280 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 0.3285157382488251, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1290 + }, + { + "epoch": 1.7723244717109747, + "grad_norm": 0.3701961636543274, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1300 + }, + { + "epoch": 1.7859577368779824, + "grad_norm": 0.3301318287849426, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 1310 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 0.37801554799079895, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 1320 + }, + { + "epoch": 1.8132242672119974, + "grad_norm": 0.3726748526096344, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1330 + }, + { + "epoch": 1.8268575323790048, + "grad_norm": 0.4059790074825287, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1340 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.35712096095085144, + "learning_rate": 0.0002, + "loss": 1.7739, + "step": 1350 + }, + { + "epoch": 1.8541240627130198, + "grad_norm": 0.35995328426361084, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1360 + }, + { + "epoch": 1.8677573278800272, + "grad_norm": 0.3679947257041931, + "learning_rate": 0.0002, + "loss": 1.7332, + "step": 1370 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 0.39645957946777344, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 1380 + }, + { + "epoch": 1.8950238582140422, + "grad_norm": 0.35288700461387634, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1390 + }, + { + "epoch": 1.9086571233810496, + "grad_norm": 0.32579198479652405, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 1400 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 0.3856561779975891, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1410 + }, + { + "epoch": 1.9359236537150648, + "grad_norm": 0.39019331336021423, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 1420 + }, + { + "epoch": 1.9495569188820723, + "grad_norm": 0.38006502389907837, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 1430 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.38100454211235046, + "learning_rate": 0.0002, + "loss": 1.8323, + "step": 1440 + }, + { + "epoch": 1.9768234492160872, + "grad_norm": 0.3405798673629761, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 1450 + }, + { + "epoch": 1.990456714383095, + "grad_norm": 0.36582913994789124, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1460 + }, + { + "epoch": 2.0, + "eval_loss": 1.8178424835205078, + "eval_runtime": 53.6524, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 1.193, + "step": 1467 + }, + { + "epoch": 2.0040899795501024, + "grad_norm": 0.3626647889614105, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 1470 + }, + { + "epoch": 2.01772324471711, + "grad_norm": 0.40171775221824646, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 1480 + }, + { + "epoch": 2.0313565098841173, + "grad_norm": 0.5805319547653198, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 1490 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 0.41954153776168823, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 1500 + }, + { + "epoch": 2.0586230402181322, + "grad_norm": 0.47190725803375244, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1510 + }, + { + "epoch": 2.0722563053851397, + "grad_norm": 0.4388456344604492, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1520 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 2.2171926498413086, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 1530 + }, + { + "epoch": 2.0995228357191547, + "grad_norm": 0.4314221143722534, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 1540 + }, + { + "epoch": 2.113156100886162, + "grad_norm": 0.4154265522956848, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 1550 + }, + { + "epoch": 2.1267893660531696, + "grad_norm": 0.5025539994239807, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1560 + }, + { + "epoch": 2.140422631220177, + "grad_norm": 0.5410493016242981, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1570 + }, + { + "epoch": 2.1540558963871845, + "grad_norm": 0.4478487968444824, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 1580 + }, + { + "epoch": 2.1676891615541924, + "grad_norm": 0.4703652560710907, + "learning_rate": 0.0002, + "loss": 1.5536, + "step": 1590 + }, + { + "epoch": 2.1813224267212, + "grad_norm": 0.4555390179157257, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1600 + }, + { + "epoch": 2.1949556918882074, + "grad_norm": 0.4877263903617859, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1610 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.48708245158195496, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1620 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.47523951530456543, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 1630 + }, + { + "epoch": 2.23585548738923, + "grad_norm": 0.4889199733734131, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 1640 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 0.4585252106189728, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 1650 + }, + { + "epoch": 2.2631220177232447, + "grad_norm": 0.4764868915081024, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1660 + }, + { + "epoch": 2.276755282890252, + "grad_norm": 0.5028976202011108, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 1670 + }, + { + "epoch": 2.2903885480572597, + "grad_norm": 0.46131211519241333, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 1680 + }, + { + "epoch": 2.304021813224267, + "grad_norm": 0.5422874689102173, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 1690 + }, + { + "epoch": 2.3176550783912746, + "grad_norm": 0.47615355253219604, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1700 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 0.48005548119544983, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1710 + }, + { + "epoch": 2.3449216087252895, + "grad_norm": 0.4387182295322418, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1720 + }, + { + "epoch": 2.358554873892297, + "grad_norm": 0.4487272799015045, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 1730 + }, + { + "epoch": 2.372188139059305, + "grad_norm": 0.5046455264091492, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 1740 + }, + { + "epoch": 2.3858214042263124, + "grad_norm": 0.4653521180152893, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 1750 + }, + { + "epoch": 2.39945466939332, + "grad_norm": 0.4737723469734192, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 1760 + }, + { + "epoch": 2.4130879345603273, + "grad_norm": 0.4501931071281433, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 1770 + }, + { + "epoch": 2.426721199727335, + "grad_norm": 0.4772880971431732, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 1780 + }, + { + "epoch": 2.4403544648943423, + "grad_norm": 0.4544616937637329, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 1790 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.488313227891922, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 1800 + }, + { + "epoch": 2.467620995228357, + "grad_norm": 0.5057830214500427, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 1810 + }, + { + "epoch": 2.4812542603953647, + "grad_norm": 0.5049484968185425, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 1820 + }, + { + "epoch": 2.494887525562372, + "grad_norm": 0.44966644048690796, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1830 + }, + { + "epoch": 2.5085207907293796, + "grad_norm": 0.5072630643844604, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1840 + }, + { + "epoch": 2.522154055896387, + "grad_norm": 0.43989792466163635, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1850 + }, + { + "epoch": 2.5357873210633946, + "grad_norm": 1.3504403829574585, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1860 + }, + { + "epoch": 2.549420586230402, + "grad_norm": 0.46545976400375366, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 1870 + }, + { + "epoch": 2.5630538513974095, + "grad_norm": 0.4678342044353485, + "learning_rate": 0.0002, + "loss": 1.6368, + "step": 1880 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.529755711555481, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 1890 + }, + { + "epoch": 2.5903203817314244, + "grad_norm": 0.5000199675559998, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 1900 + }, + { + "epoch": 2.6039536468984323, + "grad_norm": 0.5649300217628479, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 1910 + }, + { + "epoch": 2.61758691206544, + "grad_norm": 0.7920585870742798, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 1920 + }, + { + "epoch": 2.6312201772324473, + "grad_norm": 0.4960342049598694, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 1930 + }, + { + "epoch": 2.6448534423994547, + "grad_norm": 0.5324710011482239, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1940 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 0.606343150138855, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 1950 + }, + { + "epoch": 2.6721199727334697, + "grad_norm": 0.53038489818573, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1960 + }, + { + "epoch": 2.685753237900477, + "grad_norm": 0.4579465091228485, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 1970 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.4541707932949066, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 1980 + }, + { + "epoch": 2.713019768234492, + "grad_norm": 0.5009395480155945, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 1990 + }, + { + "epoch": 2.7266530334014996, + "grad_norm": 0.4723006784915924, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 2000 + }, + { + "epoch": 2.740286298568507, + "grad_norm": 0.5086126923561096, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2010 + }, + { + "epoch": 2.7539195637355145, + "grad_norm": 0.47242608666419983, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2020 + }, + { + "epoch": 2.767552828902522, + "grad_norm": 0.44922566413879395, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2030 + }, + { + "epoch": 2.78118609406953, + "grad_norm": 0.420259565114975, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 2040 + }, + { + "epoch": 2.794819359236537, + "grad_norm": 0.4762881100177765, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 2050 + }, + { + "epoch": 2.808452624403545, + "grad_norm": 0.5228786468505859, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 2060 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.4796035587787628, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 2070 + }, + { + "epoch": 2.8357191547375598, + "grad_norm": 0.5034735202789307, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 2080 + }, + { + "epoch": 2.8493524199045672, + "grad_norm": 0.48005399107933044, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 2090 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 0.578820526599884, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2100 + }, + { + "epoch": 2.876618950238582, + "grad_norm": 0.48982638120651245, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 2110 + }, + { + "epoch": 2.8902522154055896, + "grad_norm": 0.5157325863838196, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2120 + }, + { + "epoch": 2.903885480572597, + "grad_norm": 0.49149683117866516, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 2130 + }, + { + "epoch": 2.9175187457396046, + "grad_norm": 0.48584499955177307, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 2140 + }, + { + "epoch": 2.931152010906612, + "grad_norm": 0.5199017524719238, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 2150 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.5788236856460571, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 2160 + }, + { + "epoch": 2.958418541240627, + "grad_norm": 0.48664185404777527, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2170 + }, + { + "epoch": 2.9720518064076344, + "grad_norm": 0.5026682615280151, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2180 + }, + { + "epoch": 2.9856850715746424, + "grad_norm": 0.49317044019699097, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 2190 + }, + { + "epoch": 2.9993183367416494, + "grad_norm": 0.5729128122329712, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 2200 + }, + { + "epoch": 2.9993183367416494, + "eval_loss": 1.8527295589447021, + "eval_runtime": 53.6403, + "eval_samples_per_second": 9.452, + "eval_steps_per_second": 1.193, + "step": 2200 + }, + { + "epoch": 3.0129516019086573, + "grad_norm": 0.5530241131782532, + "learning_rate": 0.0002, + "loss": 1.4719, + "step": 2210 + }, + { + "epoch": 3.0265848670756648, + "grad_norm": 0.6642216444015503, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 2220 + }, + { + "epoch": 3.0402181322426722, + "grad_norm": 0.61470627784729, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 2230 + }, + { + "epoch": 3.0538513974096797, + "grad_norm": 0.8559566140174866, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 2240 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.7015801668167114, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 2250 + }, + { + "epoch": 3.0811179277436946, + "grad_norm": 0.7226442694664001, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2260 + }, + { + "epoch": 3.094751192910702, + "grad_norm": 0.7560588717460632, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 2270 + }, + { + "epoch": 3.1083844580777096, + "grad_norm": 0.6216568946838379, + "learning_rate": 0.0002, + "loss": 1.4395, + "step": 2280 + }, + { + "epoch": 3.122017723244717, + "grad_norm": 0.6768500804901123, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 2290 + }, + { + "epoch": 3.1356509884117245, + "grad_norm": 0.7028762102127075, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 2300 + }, + { + "epoch": 3.149284253578732, + "grad_norm": 0.6329697966575623, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 2310 + }, + { + "epoch": 3.1629175187457395, + "grad_norm": 0.6328264474868774, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 2320 + }, + { + "epoch": 3.176550783912747, + "grad_norm": 0.7573632001876831, + "learning_rate": 0.0002, + "loss": 1.3762, + "step": 2330 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.595740795135498, + "learning_rate": 0.0002, + "loss": 1.3553, + "step": 2340 + }, + { + "epoch": 3.2038173142467623, + "grad_norm": 0.7111806869506836, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2350 + }, + { + "epoch": 3.2174505794137698, + "grad_norm": 0.6328730583190918, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 2360 + }, + { + "epoch": 3.2310838445807772, + "grad_norm": 0.5860254168510437, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 2370 + }, + { + "epoch": 3.2447171097477847, + "grad_norm": 0.7387157082557678, + "learning_rate": 0.0002, + "loss": 1.4267, + "step": 2380 + }, + { + "epoch": 3.258350374914792, + "grad_norm": 0.6897673606872559, + "learning_rate": 0.0002, + "loss": 1.4837, + "step": 2390 + }, + { + "epoch": 3.2719836400817996, + "grad_norm": 0.7157699465751648, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 2400 + }, + { + "epoch": 3.285616905248807, + "grad_norm": 0.6422511339187622, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 2410 + }, + { + "epoch": 3.2992501704158146, + "grad_norm": 1.0481886863708496, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 2420 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.7050786018371582, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 2430 + }, + { + "epoch": 3.3265167007498295, + "grad_norm": 0.6090759038925171, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 2440 + }, + { + "epoch": 3.340149965916837, + "grad_norm": 0.6626465320587158, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 2450 + }, + { + "epoch": 3.3537832310838445, + "grad_norm": 0.6565486788749695, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 2460 + }, + { + "epoch": 3.367416496250852, + "grad_norm": 0.6449528932571411, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2470 + }, + { + "epoch": 3.3810497614178594, + "grad_norm": 0.7746227383613586, + "learning_rate": 0.0002, + "loss": 1.4773, + "step": 2480 + }, + { + "epoch": 3.3946830265848673, + "grad_norm": 0.7074846029281616, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 2490 + }, + { + "epoch": 3.4083162917518743, + "grad_norm": 0.6547690033912659, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 2500 + }, + { + "epoch": 3.4219495569188823, + "grad_norm": 0.784721314907074, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2510 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.7270277738571167, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 2520 + }, + { + "epoch": 3.449216087252897, + "grad_norm": 0.67588871717453, + "learning_rate": 0.0002, + "loss": 1.4354, + "step": 2530 + }, + { + "epoch": 3.4628493524199047, + "grad_norm": 0.6768023371696472, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2540 + }, + { + "epoch": 3.476482617586912, + "grad_norm": 0.7026481628417969, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 2550 + }, + { + "epoch": 3.4901158827539196, + "grad_norm": 0.646075963973999, + "learning_rate": 0.0002, + "loss": 1.468, + "step": 2560 + }, + { + "epoch": 3.503749147920927, + "grad_norm": 0.6288973689079285, + "learning_rate": 0.0002, + "loss": 1.4058, + "step": 2570 + }, + { + "epoch": 3.5173824130879345, + "grad_norm": 0.6440825462341309, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 2580 + }, + { + "epoch": 3.531015678254942, + "grad_norm": 0.7074111700057983, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2590 + }, + { + "epoch": 3.5446489434219495, + "grad_norm": 0.7007562518119812, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2600 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 0.6045376658439636, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 2610 + }, + { + "epoch": 3.5719154737559644, + "grad_norm": 0.9149952530860901, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 2620 + }, + { + "epoch": 3.585548738922972, + "grad_norm": 0.6490362882614136, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2630 + }, + { + "epoch": 3.59918200408998, + "grad_norm": 0.6552226543426514, + "learning_rate": 0.0002, + "loss": 1.4107, + "step": 2640 + }, + { + "epoch": 3.612815269256987, + "grad_norm": 0.6541850566864014, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 2650 + }, + { + "epoch": 3.6264485344239947, + "grad_norm": 0.6500770449638367, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 2660 + }, + { + "epoch": 3.640081799591002, + "grad_norm": 0.6345893740653992, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2670 + }, + { + "epoch": 3.6537150647580097, + "grad_norm": 0.6382275223731995, + "learning_rate": 0.0002, + "loss": 1.3634, + "step": 2680 + }, + { + "epoch": 3.667348329925017, + "grad_norm": 0.6738566160202026, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 2690 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.7446315288543701, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 2700 + }, + { + "epoch": 3.694614860259032, + "grad_norm": 0.6717571020126343, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 2710 + }, + { + "epoch": 3.7082481254260395, + "grad_norm": 0.667259693145752, + "learning_rate": 0.0002, + "loss": 1.4285, + "step": 2720 + }, + { + "epoch": 3.721881390593047, + "grad_norm": 0.6808622479438782, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 2730 + }, + { + "epoch": 3.7355146557600545, + "grad_norm": 0.7254287004470825, + "learning_rate": 0.0002, + "loss": 1.4297, + "step": 2740 + }, + { + "epoch": 3.749147920927062, + "grad_norm": 0.6864007711410522, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 2750 + }, + { + "epoch": 3.7627811860940694, + "grad_norm": 0.7041361331939697, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 2760 + }, + { + "epoch": 3.776414451261077, + "grad_norm": 0.6559903025627136, + "learning_rate": 0.0002, + "loss": 1.4284, + "step": 2770 + }, + { + "epoch": 3.7900477164280844, + "grad_norm": 0.6602269411087036, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 2780 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.692611813545227, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 2790 + }, + { + "epoch": 3.8173142467620993, + "grad_norm": 0.7051475644111633, + "learning_rate": 0.0002, + "loss": 1.4065, + "step": 2800 + }, + { + "epoch": 3.830947511929107, + "grad_norm": 0.6685371398925781, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 2810 + }, + { + "epoch": 3.8445807770961147, + "grad_norm": 0.6706477403640747, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2820 + }, + { + "epoch": 3.858214042263122, + "grad_norm": 0.6671637296676636, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 2830 + }, + { + "epoch": 3.8718473074301296, + "grad_norm": 0.694092333316803, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 2840 + }, + { + "epoch": 3.885480572597137, + "grad_norm": 0.7349600195884705, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 2850 + }, + { + "epoch": 3.8991138377641446, + "grad_norm": 0.6647971868515015, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 2860 + }, + { + "epoch": 3.912747102931152, + "grad_norm": 0.806656539440155, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2870 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.6008772850036621, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2880 + }, + { + "epoch": 3.940013633265167, + "grad_norm": 0.659227728843689, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2890 + }, + { + "epoch": 3.9536468984321744, + "grad_norm": 0.6357656717300415, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2900 + }, + { + "epoch": 3.967280163599182, + "grad_norm": 0.6541687846183777, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2910 + }, + { + "epoch": 3.9809134287661894, + "grad_norm": 0.6090909838676453, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2920 + }, + { + "epoch": 3.994546693933197, + "grad_norm": 0.7198411822319031, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2930 + }, + { + "epoch": 4.0, + "eval_loss": 1.9278366565704346, + "eval_runtime": 53.6567, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 2934 + }, + { + "epoch": 4.008179959100205, + "grad_norm": 0.6498575210571289, + "learning_rate": 0.0002, + "loss": 1.3159, + "step": 2940 + }, + { + "epoch": 4.021813224267212, + "grad_norm": 0.865602433681488, + "learning_rate": 0.0002, + "loss": 1.2075, + "step": 2950 + }, + { + "epoch": 4.03544648943422, + "grad_norm": 0.8514999151229858, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 2960 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 1.0677322149276733, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 2970 + }, + { + "epoch": 4.062713019768235, + "grad_norm": 1.0126488208770752, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 2980 + }, + { + "epoch": 4.076346284935242, + "grad_norm": 1.0008870363235474, + "learning_rate": 0.0002, + "loss": 1.1631, + "step": 2990 + }, + { + "epoch": 4.08997955010225, + "grad_norm": 0.7942054271697998, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 3000 + }, + { + "epoch": 4.103612815269257, + "grad_norm": 1.0482100248336792, + "learning_rate": 0.0002, + "loss": 1.214, + "step": 3010 + }, + { + "epoch": 4.1172460804362645, + "grad_norm": 1.0516992807388306, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 3020 + }, + { + "epoch": 4.130879345603272, + "grad_norm": 0.8144322037696838, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 3030 + }, + { + "epoch": 4.144512610770279, + "grad_norm": 0.952297568321228, + "learning_rate": 0.0002, + "loss": 1.1782, + "step": 3040 + }, + { + "epoch": 4.158145875937287, + "grad_norm": 1.007645606994629, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 3050 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 1.0480353832244873, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 3060 + }, + { + "epoch": 4.185412406271302, + "grad_norm": 0.9270663857460022, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 3070 + }, + { + "epoch": 4.199045671438309, + "grad_norm": 1.3415262699127197, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 3080 + }, + { + "epoch": 4.212678936605317, + "grad_norm": 1.167606234550476, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 3090 + }, + { + "epoch": 4.226312201772324, + "grad_norm": 0.9418690800666809, + "learning_rate": 0.0002, + "loss": 1.2605, + "step": 3100 + }, + { + "epoch": 4.239945466939332, + "grad_norm": 1.0885876417160034, + "learning_rate": 0.0002, + "loss": 1.2184, + "step": 3110 + }, + { + "epoch": 4.253578732106339, + "grad_norm": 0.9165483713150024, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 3120 + }, + { + "epoch": 4.267211997273347, + "grad_norm": 0.9154694080352783, + "learning_rate": 0.0002, + "loss": 1.2933, + "step": 3130 + }, + { + "epoch": 4.280845262440354, + "grad_norm": 1.100580096244812, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3140 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.9367576241493225, + "learning_rate": 0.0002, + "loss": 1.251, + "step": 3150 + }, + { + "epoch": 4.308111792774369, + "grad_norm": 0.9744015336036682, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3160 + }, + { + "epoch": 4.321745057941377, + "grad_norm": 0.9865175485610962, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 3170 + }, + { + "epoch": 4.335378323108385, + "grad_norm": 1.0124907493591309, + "learning_rate": 0.0002, + "loss": 1.2161, + "step": 3180 + }, + { + "epoch": 4.349011588275392, + "grad_norm": 1.1044819355010986, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 3190 + }, + { + "epoch": 4.3626448534424, + "grad_norm": 0.9305577278137207, + "learning_rate": 0.0002, + "loss": 1.2483, + "step": 3200 + }, + { + "epoch": 4.376278118609407, + "grad_norm": 0.969265341758728, + "learning_rate": 0.0002, + "loss": 1.2101, + "step": 3210 + }, + { + "epoch": 4.389911383776415, + "grad_norm": 1.0671923160552979, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 3220 + }, + { + "epoch": 4.403544648943422, + "grad_norm": 0.9440539479255676, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 3230 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 0.9824562668800354, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 3240 + }, + { + "epoch": 4.430811179277437, + "grad_norm": 1.0245535373687744, + "learning_rate": 0.0002, + "loss": 1.2234, + "step": 3250 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.9629312753677368, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 3260 + }, + { + "epoch": 4.458077709611452, + "grad_norm": 1.1556470394134521, + "learning_rate": 0.0002, + "loss": 1.2689, + "step": 3270 + }, + { + "epoch": 4.47171097477846, + "grad_norm": 0.9796679019927979, + "learning_rate": 0.0002, + "loss": 1.2214, + "step": 3280 + }, + { + "epoch": 4.485344239945467, + "grad_norm": 0.9030535221099854, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3290 + }, + { + "epoch": 4.4989775051124745, + "grad_norm": 0.9142820835113525, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 3300 + }, + { + "epoch": 4.5126107702794815, + "grad_norm": 0.966867208480835, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 3310 + }, + { + "epoch": 4.5262440354464895, + "grad_norm": 1.0127079486846924, + "learning_rate": 0.0002, + "loss": 1.2537, + "step": 3320 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 1.055506706237793, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 3330 + }, + { + "epoch": 4.553510565780504, + "grad_norm": 0.9831468462944031, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 3340 + }, + { + "epoch": 4.567143830947512, + "grad_norm": 0.9304661154747009, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 3350 + }, + { + "epoch": 4.580777096114519, + "grad_norm": 0.9369107484817505, + "learning_rate": 0.0002, + "loss": 1.3621, + "step": 3360 + }, + { + "epoch": 4.594410361281527, + "grad_norm": 1.009506344795227, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 3370 + }, + { + "epoch": 4.608043626448534, + "grad_norm": 1.0575741529464722, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 3380 + }, + { + "epoch": 4.621676891615542, + "grad_norm": 0.9102860689163208, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 3390 + }, + { + "epoch": 4.635310156782549, + "grad_norm": 0.8111315965652466, + "learning_rate": 0.0002, + "loss": 1.3156, + "step": 3400 + }, + { + "epoch": 4.648943421949557, + "grad_norm": 0.9459649920463562, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3410 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 0.9709545969963074, + "learning_rate": 0.0002, + "loss": 1.3146, + "step": 3420 + }, + { + "epoch": 4.676209952283572, + "grad_norm": 0.9909247159957886, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 3430 + }, + { + "epoch": 4.689843217450579, + "grad_norm": 0.9094610810279846, + "learning_rate": 0.0002, + "loss": 1.3186, + "step": 3440 + }, + { + "epoch": 4.703476482617587, + "grad_norm": 0.9012220501899719, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 3450 + }, + { + "epoch": 4.717109747784594, + "grad_norm": 0.8669242858886719, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 3460 + }, + { + "epoch": 4.730743012951602, + "grad_norm": 0.9753699898719788, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 3470 + }, + { + "epoch": 4.74437627811861, + "grad_norm": 1.0252684354782104, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 3480 + }, + { + "epoch": 4.758009543285617, + "grad_norm": 1.208098292350769, + "learning_rate": 0.0002, + "loss": 1.2536, + "step": 3490 + }, + { + "epoch": 4.771642808452625, + "grad_norm": 0.8632914423942566, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 3500 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 1.0084818601608276, + "learning_rate": 0.0002, + "loss": 1.3062, + "step": 3510 + }, + { + "epoch": 4.79890933878664, + "grad_norm": 0.9095172882080078, + "learning_rate": 0.0002, + "loss": 1.3004, + "step": 3520 + }, + { + "epoch": 4.812542603953647, + "grad_norm": 0.9740135669708252, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 3530 + }, + { + "epoch": 4.826175869120655, + "grad_norm": 0.8862348794937134, + "learning_rate": 0.0002, + "loss": 1.2816, + "step": 3540 + }, + { + "epoch": 4.839809134287662, + "grad_norm": 1.0761774778366089, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 3550 + }, + { + "epoch": 4.85344239945467, + "grad_norm": 1.0134117603302002, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 3560 + }, + { + "epoch": 4.867075664621677, + "grad_norm": 0.9262851476669312, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 3570 + }, + { + "epoch": 4.8807089297886845, + "grad_norm": 0.9518504738807678, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 3580 + }, + { + "epoch": 4.894342194955692, + "grad_norm": 1.10103178024292, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 3590 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 1.0133225917816162, + "learning_rate": 0.0002, + "loss": 1.2592, + "step": 3600 + }, + { + "epoch": 4.9216087252897065, + "grad_norm": 0.9637737274169922, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 3610 + }, + { + "epoch": 4.935241990456714, + "grad_norm": 0.9800633192062378, + "learning_rate": 0.0002, + "loss": 1.2991, + "step": 3620 + }, + { + "epoch": 4.948875255623722, + "grad_norm": 1.0065973997116089, + "learning_rate": 0.0002, + "loss": 1.2872, + "step": 3630 + }, + { + "epoch": 4.962508520790729, + "grad_norm": 0.9354690313339233, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 3640 + }, + { + "epoch": 4.976141785957737, + "grad_norm": 0.9744119048118591, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 3650 + }, + { + "epoch": 4.989775051124744, + "grad_norm": 0.9357708096504211, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 3660 + }, + { + "epoch": 4.999318336741649, + "eval_loss": 2.0763096809387207, + "eval_runtime": 53.6578, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 3667 + }, + { + "epoch": 5.003408316291752, + "grad_norm": 1.3171669244766235, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 3670 + }, + { + "epoch": 5.017041581458759, + "grad_norm": 1.4427374601364136, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 3680 + }, + { + "epoch": 5.030674846625767, + "grad_norm": 0.9313797354698181, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 3690 + }, + { + "epoch": 5.044308111792774, + "grad_norm": 1.417641282081604, + "learning_rate": 0.0002, + "loss": 0.9481, + "step": 3700 + }, + { + "epoch": 5.057941376959782, + "grad_norm": 1.097440242767334, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 3710 + }, + { + "epoch": 5.071574642126789, + "grad_norm": 1.4277986288070679, + "learning_rate": 0.0002, + "loss": 1.0416, + "step": 3720 + }, + { + "epoch": 5.085207907293797, + "grad_norm": 1.2520873546600342, + "learning_rate": 0.0002, + "loss": 0.9718, + "step": 3730 + }, + { + "epoch": 5.098841172460804, + "grad_norm": 1.39503812789917, + "learning_rate": 0.0002, + "loss": 0.9531, + "step": 3740 + }, + { + "epoch": 5.112474437627812, + "grad_norm": 1.2345329523086548, + "learning_rate": 0.0002, + "loss": 0.9658, + "step": 3750 + }, + { + "epoch": 5.126107702794819, + "grad_norm": 1.2700239419937134, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 3760 + }, + { + "epoch": 5.139740967961827, + "grad_norm": 1.5343066453933716, + "learning_rate": 0.0002, + "loss": 0.993, + "step": 3770 + }, + { + "epoch": 5.153374233128835, + "grad_norm": 1.4191608428955078, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 3780 + }, + { + "epoch": 5.167007498295842, + "grad_norm": 1.4591023921966553, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 3790 + }, + { + "epoch": 5.18064076346285, + "grad_norm": 1.6158121824264526, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 3800 + }, + { + "epoch": 5.194274028629857, + "grad_norm": 1.6077582836151123, + "learning_rate": 0.0002, + "loss": 1.0056, + "step": 3810 + }, + { + "epoch": 5.207907293796865, + "grad_norm": 1.2815653085708618, + "learning_rate": 0.0002, + "loss": 0.9711, + "step": 3820 + }, + { + "epoch": 5.221540558963872, + "grad_norm": 1.2427219152450562, + "learning_rate": 0.0002, + "loss": 1.0131, + "step": 3830 + }, + { + "epoch": 5.23517382413088, + "grad_norm": 1.3013232946395874, + "learning_rate": 0.0002, + "loss": 0.9901, + "step": 3840 + }, + { + "epoch": 5.248807089297887, + "grad_norm": 1.4643588066101074, + "learning_rate": 0.0002, + "loss": 0.9862, + "step": 3850 + }, + { + "epoch": 5.2624403544648946, + "grad_norm": 1.2571916580200195, + "learning_rate": 0.0002, + "loss": 1.0149, + "step": 3860 + }, + { + "epoch": 5.276073619631902, + "grad_norm": 1.226682186126709, + "learning_rate": 0.0002, + "loss": 0.9686, + "step": 3870 + }, + { + "epoch": 5.2897068847989095, + "grad_norm": 1.2541271448135376, + "learning_rate": 0.0002, + "loss": 0.9417, + "step": 3880 + }, + { + "epoch": 5.3033401499659165, + "grad_norm": 1.2340261936187744, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 3890 + }, + { + "epoch": 5.316973415132924, + "grad_norm": 1.345527172088623, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 3900 + }, + { + "epoch": 5.3306066802999315, + "grad_norm": 1.2128909826278687, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 3910 + }, + { + "epoch": 5.344239945466939, + "grad_norm": 1.3052637577056885, + "learning_rate": 0.0002, + "loss": 1.0002, + "step": 3920 + }, + { + "epoch": 5.357873210633947, + "grad_norm": 1.1017392873764038, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 3930 + }, + { + "epoch": 5.371506475800954, + "grad_norm": 1.26950204372406, + "learning_rate": 0.0002, + "loss": 1.0579, + "step": 3940 + }, + { + "epoch": 5.385139740967962, + "grad_norm": 1.3372546434402466, + "learning_rate": 0.0002, + "loss": 1.0816, + "step": 3950 + }, + { + "epoch": 5.398773006134969, + "grad_norm": 1.3115156888961792, + "learning_rate": 0.0002, + "loss": 1.0529, + "step": 3960 + }, + { + "epoch": 5.412406271301977, + "grad_norm": 1.3511474132537842, + "learning_rate": 0.0002, + "loss": 1.1179, + "step": 3970 + }, + { + "epoch": 5.426039536468984, + "grad_norm": 1.1001893281936646, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 3980 + }, + { + "epoch": 5.439672801635992, + "grad_norm": 1.2810745239257812, + "learning_rate": 0.0002, + "loss": 1.0855, + "step": 3990 + }, + { + "epoch": 5.453306066802999, + "grad_norm": 1.2999306917190552, + "learning_rate": 0.0002, + "loss": 1.0573, + "step": 4000 + }, + { + "epoch": 5.466939331970007, + "grad_norm": 1.172553300857544, + "learning_rate": 0.0002, + "loss": 1.0073, + "step": 4010 + }, + { + "epoch": 5.480572597137014, + "grad_norm": 1.1483557224273682, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 4020 + }, + { + "epoch": 5.494205862304022, + "grad_norm": 1.4148036241531372, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 4030 + }, + { + "epoch": 5.507839127471029, + "grad_norm": 1.1611121892929077, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 4040 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 1.3837119340896606, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 4050 + }, + { + "epoch": 5.535105657805044, + "grad_norm": 1.3025696277618408, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 4060 + }, + { + "epoch": 5.548738922972052, + "grad_norm": 1.348091959953308, + "learning_rate": 0.0002, + "loss": 1.0628, + "step": 4070 + }, + { + "epoch": 5.56237218813906, + "grad_norm": 1.3463449478149414, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 4080 + }, + { + "epoch": 5.576005453306067, + "grad_norm": 1.3904176950454712, + "learning_rate": 0.0002, + "loss": 1.039, + "step": 4090 + }, + { + "epoch": 5.589638718473074, + "grad_norm": 1.2737950086593628, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 4100 + }, + { + "epoch": 5.603271983640082, + "grad_norm": 1.3311827182769775, + "learning_rate": 0.0002, + "loss": 1.0441, + "step": 4110 + }, + { + "epoch": 5.61690524880709, + "grad_norm": 1.24485182762146, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 4120 + }, + { + "epoch": 5.630538513974097, + "grad_norm": 1.2724957466125488, + "learning_rate": 0.0002, + "loss": 1.1103, + "step": 4130 + }, + { + "epoch": 5.644171779141105, + "grad_norm": 1.3439847230911255, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 4140 + }, + { + "epoch": 5.657805044308112, + "grad_norm": 1.372359037399292, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 4150 + }, + { + "epoch": 5.6714383094751195, + "grad_norm": 1.2322949171066284, + "learning_rate": 0.0002, + "loss": 1.0475, + "step": 4160 + }, + { + "epoch": 5.6850715746421265, + "grad_norm": 1.4859193563461304, + "learning_rate": 0.0002, + "loss": 1.0465, + "step": 4170 + }, + { + "epoch": 5.6987048398091344, + "grad_norm": 1.4318448305130005, + "learning_rate": 0.0002, + "loss": 1.1569, + "step": 4180 + }, + { + "epoch": 5.7123381049761415, + "grad_norm": 1.1533565521240234, + "learning_rate": 0.0002, + "loss": 1.017, + "step": 4190 + }, + { + "epoch": 5.725971370143149, + "grad_norm": 1.3009696006774902, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 4200 + }, + { + "epoch": 5.739604635310156, + "grad_norm": 1.3972162008285522, + "learning_rate": 0.0002, + "loss": 1.1229, + "step": 4210 + }, + { + "epoch": 5.753237900477164, + "grad_norm": 1.2142186164855957, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 4220 + }, + { + "epoch": 5.766871165644172, + "grad_norm": 1.401191234588623, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 4230 + }, + { + "epoch": 5.780504430811179, + "grad_norm": 1.4124404191970825, + "learning_rate": 0.0002, + "loss": 1.0722, + "step": 4240 + }, + { + "epoch": 5.794137695978186, + "grad_norm": 1.3488332033157349, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 4250 + }, + { + "epoch": 5.807770961145194, + "grad_norm": 1.3671752214431763, + "learning_rate": 0.0002, + "loss": 1.0599, + "step": 4260 + }, + { + "epoch": 5.821404226312202, + "grad_norm": 1.2608201503753662, + "learning_rate": 0.0002, + "loss": 1.1294, + "step": 4270 + }, + { + "epoch": 5.835037491479209, + "grad_norm": 1.1814045906066895, + "learning_rate": 0.0002, + "loss": 1.1216, + "step": 4280 + }, + { + "epoch": 5.848670756646217, + "grad_norm": 1.4139586687088013, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 4290 + }, + { + "epoch": 5.862304021813224, + "grad_norm": 1.34248948097229, + "learning_rate": 0.0002, + "loss": 1.0656, + "step": 4300 + }, + { + "epoch": 5.875937286980232, + "grad_norm": 1.1428139209747314, + "learning_rate": 0.0002, + "loss": 1.0791, + "step": 4310 + }, + { + "epoch": 5.889570552147239, + "grad_norm": 1.1941087245941162, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 4320 + }, + { + "epoch": 5.903203817314247, + "grad_norm": 1.2374001741409302, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 4330 + }, + { + "epoch": 5.916837082481254, + "grad_norm": 1.4314988851547241, + "learning_rate": 0.0002, + "loss": 1.0802, + "step": 4340 + }, + { + "epoch": 5.930470347648262, + "grad_norm": 1.1286126375198364, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 4350 + }, + { + "epoch": 5.944103612815269, + "grad_norm": 1.25884211063385, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 4360 + }, + { + "epoch": 5.957736877982277, + "grad_norm": 1.223357915878296, + "learning_rate": 0.0002, + "loss": 1.1189, + "step": 4370 + }, + { + "epoch": 5.971370143149285, + "grad_norm": 1.2173810005187988, + "learning_rate": 0.0002, + "loss": 1.1335, + "step": 4380 + }, + { + "epoch": 5.985003408316292, + "grad_norm": 1.3152292966842651, + "learning_rate": 0.0002, + "loss": 1.1201, + "step": 4390 + }, + { + "epoch": 5.998636673483299, + "grad_norm": 1.5576739311218262, + "learning_rate": 0.0002, + "loss": 1.1456, + "step": 4400 + }, + { + "epoch": 6.0, + "eval_loss": 2.3435311317443848, + "eval_runtime": 53.6362, + "eval_samples_per_second": 9.453, + "eval_steps_per_second": 1.193, + "step": 4401 + }, + { + "epoch": 6.012269938650307, + "grad_norm": 2.027981758117676, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 4410 + }, + { + "epoch": 6.025903203817315, + "grad_norm": 1.4775491952896118, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 4420 + }, + { + "epoch": 6.039536468984322, + "grad_norm": 1.6902967691421509, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 4430 + }, + { + "epoch": 6.0531697341513295, + "grad_norm": 1.2506479024887085, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4440 + }, + { + "epoch": 6.0668029993183366, + "grad_norm": 1.5935661792755127, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 4450 + }, + { + "epoch": 6.0804362644853445, + "grad_norm": 1.2966011762619019, + "learning_rate": 0.0002, + "loss": 0.7869, + "step": 4460 + }, + { + "epoch": 6.0940695296523515, + "grad_norm": 1.5247948169708252, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 4470 + }, + { + "epoch": 6.107702794819359, + "grad_norm": 1.6415225267410278, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 4480 + }, + { + "epoch": 6.121336059986366, + "grad_norm": 1.5510778427124023, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 4490 + }, + { + "epoch": 6.134969325153374, + "grad_norm": 1.361097812652588, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 4500 + }, + { + "epoch": 6.148602590320381, + "grad_norm": 1.8347383737564087, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 4510 + }, + { + "epoch": 6.162235855487389, + "grad_norm": 1.570560097694397, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4520 + }, + { + "epoch": 6.175869120654396, + "grad_norm": 1.517993688583374, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4530 + }, + { + "epoch": 6.189502385821404, + "grad_norm": 1.4517489671707153, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 4540 + }, + { + "epoch": 6.203135650988412, + "grad_norm": 1.557098627090454, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 4550 + }, + { + "epoch": 6.216768916155419, + "grad_norm": 1.7379891872406006, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4560 + }, + { + "epoch": 6.230402181322427, + "grad_norm": 2.2292542457580566, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 4570 + }, + { + "epoch": 6.244035446489434, + "grad_norm": 1.834366798400879, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 4580 + }, + { + "epoch": 6.257668711656442, + "grad_norm": 1.6755090951919556, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 4590 + }, + { + "epoch": 6.271301976823449, + "grad_norm": 1.828898549079895, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4600 + }, + { + "epoch": 6.284935241990457, + "grad_norm": 1.9773457050323486, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 4610 + }, + { + "epoch": 6.298568507157464, + "grad_norm": 1.533369541168213, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4620 + }, + { + "epoch": 6.312201772324472, + "grad_norm": 1.5432997941970825, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4630 + }, + { + "epoch": 6.325835037491479, + "grad_norm": 1.6686866283416748, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 4640 + }, + { + "epoch": 6.339468302658487, + "grad_norm": 1.545304298400879, + "learning_rate": 0.0002, + "loss": 0.8656, + "step": 4650 + }, + { + "epoch": 6.353101567825494, + "grad_norm": 1.5981945991516113, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 4660 + }, + { + "epoch": 6.366734832992502, + "grad_norm": 1.6973154544830322, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4670 + }, + { + "epoch": 6.38036809815951, + "grad_norm": 1.6782612800598145, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 4680 + }, + { + "epoch": 6.394001363326517, + "grad_norm": 1.5710086822509766, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 4690 + }, + { + "epoch": 6.407634628493525, + "grad_norm": 1.7241147756576538, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 4700 + }, + { + "epoch": 6.421267893660532, + "grad_norm": 1.7736736536026, + "learning_rate": 0.0002, + "loss": 0.8768, + "step": 4710 + }, + { + "epoch": 6.4349011588275395, + "grad_norm": 1.7924901247024536, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 4720 + }, + { + "epoch": 6.448534423994547, + "grad_norm": 1.4030500650405884, + "learning_rate": 0.0002, + "loss": 0.832, + "step": 4730 + }, + { + "epoch": 6.4621676891615545, + "grad_norm": 1.6925519704818726, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 4740 + }, + { + "epoch": 6.4758009543285615, + "grad_norm": 1.362905502319336, + "learning_rate": 0.0002, + "loss": 0.8556, + "step": 4750 + }, + { + "epoch": 6.489434219495569, + "grad_norm": 1.5281150341033936, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 4760 + }, + { + "epoch": 6.5030674846625764, + "grad_norm": 1.524671196937561, + "learning_rate": 0.0002, + "loss": 0.8396, + "step": 4770 + }, + { + "epoch": 6.516700749829584, + "grad_norm": 1.7029320001602173, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 4780 + }, + { + "epoch": 6.530334014996591, + "grad_norm": 1.4663511514663696, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 4790 + }, + { + "epoch": 6.543967280163599, + "grad_norm": 1.7682101726531982, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 4800 + }, + { + "epoch": 6.557600545330606, + "grad_norm": 1.6056565046310425, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 4810 + }, + { + "epoch": 6.571233810497614, + "grad_norm": 1.6552391052246094, + "learning_rate": 0.0002, + "loss": 0.8747, + "step": 4820 + }, + { + "epoch": 6.584867075664622, + "grad_norm": 1.4265215396881104, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 4830 + }, + { + "epoch": 6.598500340831629, + "grad_norm": 1.6225470304489136, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 4840 + }, + { + "epoch": 6.612133605998636, + "grad_norm": 1.6568684577941895, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 4850 + }, + { + "epoch": 6.625766871165644, + "grad_norm": 1.760115146636963, + "learning_rate": 0.0002, + "loss": 0.8825, + "step": 4860 + }, + { + "epoch": 6.639400136332652, + "grad_norm": 1.627966046333313, + "learning_rate": 0.0002, + "loss": 0.9227, + "step": 4870 + }, + { + "epoch": 6.653033401499659, + "grad_norm": 1.7053254842758179, + "learning_rate": 0.0002, + "loss": 0.8825, + "step": 4880 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.5339484214782715, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 4890 + }, + { + "epoch": 6.680299931833674, + "grad_norm": 1.5594874620437622, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 4900 + }, + { + "epoch": 6.693933197000682, + "grad_norm": 1.5322152376174927, + "learning_rate": 0.0002, + "loss": 0.842, + "step": 4910 + }, + { + "epoch": 6.707566462167689, + "grad_norm": 1.733410358428955, + "learning_rate": 0.0002, + "loss": 0.8049, + "step": 4920 + }, + { + "epoch": 6.721199727334697, + "grad_norm": 1.3626887798309326, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 4930 + }, + { + "epoch": 6.734832992501704, + "grad_norm": 1.6323494911193848, + "learning_rate": 0.0002, + "loss": 0.9481, + "step": 4940 + }, + { + "epoch": 6.748466257668712, + "grad_norm": 1.6548917293548584, + "learning_rate": 0.0002, + "loss": 0.8803, + "step": 4950 + }, + { + "epoch": 6.762099522835719, + "grad_norm": 1.7894278764724731, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 4960 + }, + { + "epoch": 6.775732788002727, + "grad_norm": 1.7960841655731201, + "learning_rate": 0.0002, + "loss": 0.9137, + "step": 4970 + }, + { + "epoch": 6.789366053169735, + "grad_norm": 1.4888852834701538, + "learning_rate": 0.0002, + "loss": 0.9088, + "step": 4980 + }, + { + "epoch": 6.802999318336742, + "grad_norm": 1.6368865966796875, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 4990 + }, + { + "epoch": 6.816632583503749, + "grad_norm": 1.7106667757034302, + "learning_rate": 0.0002, + "loss": 0.9939, + "step": 5000 + }, + { + "epoch": 6.830265848670757, + "grad_norm": 4.131956100463867, + "learning_rate": 0.0002, + "loss": 0.8551, + "step": 5010 + }, + { + "epoch": 6.8438991138377645, + "grad_norm": 1.6357536315917969, + "learning_rate": 0.0002, + "loss": 0.908, + "step": 5020 + }, + { + "epoch": 6.8575323790047715, + "grad_norm": 1.621524453163147, + "learning_rate": 0.0002, + "loss": 0.8661, + "step": 5030 + }, + { + "epoch": 6.871165644171779, + "grad_norm": 1.6400790214538574, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 5040 + }, + { + "epoch": 6.8847989093387865, + "grad_norm": 1.823006272315979, + "learning_rate": 0.0002, + "loss": 0.9204, + "step": 5050 + }, + { + "epoch": 6.898432174505794, + "grad_norm": 1.6328210830688477, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 5060 + }, + { + "epoch": 6.912065439672801, + "grad_norm": 1.3616089820861816, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 5070 + }, + { + "epoch": 6.925698704839809, + "grad_norm": 1.7202986478805542, + "learning_rate": 0.0002, + "loss": 0.8791, + "step": 5080 + }, + { + "epoch": 6.939331970006816, + "grad_norm": 1.8145297765731812, + "learning_rate": 0.0002, + "loss": 0.8331, + "step": 5090 + }, + { + "epoch": 6.952965235173824, + "grad_norm": 1.5432910919189453, + "learning_rate": 0.0002, + "loss": 0.861, + "step": 5100 + }, + { + "epoch": 6.966598500340831, + "grad_norm": 1.2784099578857422, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 5110 + }, + { + "epoch": 6.980231765507839, + "grad_norm": 1.556593894958496, + "learning_rate": 0.0002, + "loss": 0.9189, + "step": 5120 + }, + { + "epoch": 6.993865030674847, + "grad_norm": 1.5102856159210205, + "learning_rate": 0.0002, + "loss": 0.8961, + "step": 5130 + }, + { + "epoch": 6.999318336741649, + "eval_loss": 2.5376713275909424, + "eval_runtime": 53.6377, + "eval_samples_per_second": 9.452, + "eval_steps_per_second": 1.193, + "step": 5134 + } + ], + "logging_steps": 10, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6372715192778752e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5134/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f768e2b866ad8ca4e17f12ea49ff31c2a1602c35 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c6864cf11d11ba889d3d40854fe86ffedbe4acbbedf5666c1ae0083cf7c365b +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ed0f4a4563af7bf789dbd43ca6291f63688b98a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:482ece62c7a685946e2fa4d395e6aa8c206323faa5afc76c7f1444078a9a18bf +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e5755f6bdc50d684f7d2b2b869417b2febf68290 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45d5dc405ada830755e50aca827264880526a3fa85ccd0b443aee777433ac61c +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa9331c8327df58ec8fc04719d9a463d6bd6c7aa --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad4f28f2c8f846e2a022737f5565cc616369eb1401ac4bc8f8b6b79786bfbac +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..558f2c1c4d941f5fbaaef64c107ae7292753dff1 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/trainer_state.json @@ -0,0 +1,4199 @@ +{ + "best_metric": 1.8171186447143555, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", + "epoch": 7.994546693933197, + "eval_steps": 10, + "global_step": 5864, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013633265167007498, + "grad_norm": 0.7714291214942932, + "learning_rate": 0.0002, + "loss": 3.0982, + "step": 10 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.5473978519439697, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 20 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.5452795624732971, + "learning_rate": 0.0002, + "loss": 2.3079, + "step": 30 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.5098028779029846, + "learning_rate": 0.0002, + "loss": 2.0019, + "step": 40 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.48062971234321594, + "learning_rate": 0.0002, + "loss": 1.9333, + "step": 50 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.4505695104598999, + "learning_rate": 0.0002, + "loss": 1.9355, + "step": 60 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.41609591245651245, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 70 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.4323892593383789, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 80 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.4670293629169464, + "learning_rate": 0.0002, + "loss": 1.9294, + "step": 90 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.40623316168785095, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 100 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.3620383143424988, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 110 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.332218736410141, + "learning_rate": 0.0002, + "loss": 1.9238, + "step": 120 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.4004521667957306, + "learning_rate": 0.0002, + "loss": 1.93, + "step": 130 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 140 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.3847949504852295, + "learning_rate": 0.0002, + "loss": 1.8771, + "step": 150 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.36843451857566833, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 160 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.37301021814346313, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 170 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.3718886971473694, + "learning_rate": 0.0002, + "loss": 1.8909, + "step": 180 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.3088490962982178, + "learning_rate": 0.0002, + "loss": 1.8454, + "step": 190 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.3611852526664734, + "learning_rate": 0.0002, + "loss": 1.9254, + "step": 200 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.36093324422836304, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 210 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.3250400722026825, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 220 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.3566756248474121, + "learning_rate": 0.0002, + "loss": 1.8729, + "step": 230 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.32872408628463745, + "learning_rate": 0.0002, + "loss": 1.9259, + "step": 240 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.3983881175518036, + "learning_rate": 0.0002, + "loss": 1.9033, + "step": 250 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.3571510910987854, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 260 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.3036131262779236, + "learning_rate": 0.0002, + "loss": 1.8539, + "step": 270 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.36512863636016846, + "learning_rate": 0.0002, + "loss": 1.8572, + "step": 280 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.3429736793041229, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 290 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.3055964708328247, + "learning_rate": 0.0002, + "loss": 1.8754, + "step": 300 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.33801034092903137, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 310 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.348783016204834, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 320 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.3057514727115631, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 330 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.3849763572216034, + "learning_rate": 0.0002, + "loss": 1.8766, + "step": 340 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.30080053210258484, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 350 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.3595106303691864, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 360 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.31099820137023926, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 370 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.3157978355884552, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 380 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.27960965037345886, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 390 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.3102385103702545, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 400 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.32828861474990845, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 410 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.29560017585754395, + "learning_rate": 0.0002, + "loss": 1.8165, + "step": 420 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.33316895365715027, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 430 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.30420982837677, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.32619214057922363, + "learning_rate": 0.0002, + "loss": 1.7565, + "step": 450 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.3603750765323639, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 460 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.30834096670150757, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 470 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.28756365180015564, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 480 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.2878406345844269, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 490 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.31329697370529175, + "learning_rate": 0.0002, + "loss": 1.8581, + "step": 500 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.3405822515487671, + "learning_rate": 0.0002, + "loss": 1.7886, + "step": 510 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.305560827255249, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 520 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.2973416745662689, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 530 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.327303946018219, + "learning_rate": 0.0002, + "loss": 1.8223, + "step": 540 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.62595534324646, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 550 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.3129784166812897, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 560 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.32496583461761475, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 570 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.3098868131637573, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 580 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.30726853013038635, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 590 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.2964220643043518, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 600 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.32352274656295776, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 610 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.2938912510871887, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 620 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.295559823513031, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 630 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.34102028608322144, + "learning_rate": 0.0002, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.29676181077957153, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.3108902871608734, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 660 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.2690821588039398, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 670 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 680 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.8029476404190063, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 690 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.30534422397613525, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 700 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.2899954319000244, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 710 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.28814372420310974, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 720 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.3061596751213074, + "learning_rate": 0.0002, + "loss": 1.8865, + "step": 730 + }, + { + "epoch": 0.9993183367416496, + "eval_loss": 1.8171186447143555, + "eval_runtime": 53.6047, + "eval_samples_per_second": 9.458, + "eval_steps_per_second": 1.194, + "step": 733 + }, + { + "epoch": 1.008861622358555, + "grad_norm": 0.3140897750854492, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 740 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.3346109390258789, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 750 + }, + { + "epoch": 1.0361281526925699, + "grad_norm": 0.3582976758480072, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 760 + }, + { + "epoch": 1.0497614178595773, + "grad_norm": 0.30408260226249695, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 770 + }, + { + "epoch": 1.0633946830265848, + "grad_norm": 0.323585569858551, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 780 + }, + { + "epoch": 1.0770279481935923, + "grad_norm": 0.3474137783050537, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 790 + }, + { + "epoch": 1.0906612133606, + "grad_norm": 0.35721147060394287, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 800 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.35366931557655334, + "learning_rate": 0.0002, + "loss": 1.718, + "step": 810 + }, + { + "epoch": 1.117927743694615, + "grad_norm": 0.3250770568847656, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 820 + }, + { + "epoch": 1.1315610088616224, + "grad_norm": 0.3293766379356384, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 830 + }, + { + "epoch": 1.1451942740286298, + "grad_norm": 0.3380851745605469, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 840 + }, + { + "epoch": 1.1588275391956373, + "grad_norm": 0.32584455609321594, + "learning_rate": 0.0002, + "loss": 1.8236, + "step": 850 + }, + { + "epoch": 1.1724608043626448, + "grad_norm": 0.45700767636299133, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 860 + }, + { + "epoch": 1.1860940695296525, + "grad_norm": 0.30944544076919556, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 870 + }, + { + "epoch": 1.19972733469666, + "grad_norm": 0.3268151581287384, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 880 + }, + { + "epoch": 1.2133605998636674, + "grad_norm": 0.39972540736198425, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 890 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.7890929579734802, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 900 + }, + { + "epoch": 1.2406271301976823, + "grad_norm": 0.3439182639122009, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 910 + }, + { + "epoch": 1.2542603953646898, + "grad_norm": 0.3986225128173828, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 920 + }, + { + "epoch": 1.2678936605316973, + "grad_norm": 0.3514605164527893, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 930 + }, + { + "epoch": 1.2815269256987047, + "grad_norm": 0.3682589530944824, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 940 + }, + { + "epoch": 1.2951601908657122, + "grad_norm": 0.3618335723876953, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 950 + }, + { + "epoch": 1.30879345603272, + "grad_norm": 0.345700740814209, + "learning_rate": 0.0002, + "loss": 1.7436, + "step": 960 + }, + { + "epoch": 1.3224267211997274, + "grad_norm": 0.3514927923679352, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 970 + }, + { + "epoch": 1.3360599863667348, + "grad_norm": 0.365647554397583, + "learning_rate": 0.0002, + "loss": 1.7704, + "step": 980 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.3407285809516907, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 990 + }, + { + "epoch": 1.3633265167007498, + "grad_norm": 0.3785437345504761, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1000 + }, + { + "epoch": 1.3769597818677572, + "grad_norm": 0.34746724367141724, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1010 + }, + { + "epoch": 1.390593047034765, + "grad_norm": 0.362444132566452, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1020 + }, + { + "epoch": 1.4042263122017724, + "grad_norm": 0.4424704611301422, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1030 + }, + { + "epoch": 1.4178595773687799, + "grad_norm": 0.38722458481788635, + "learning_rate": 0.0002, + "loss": 1.726, + "step": 1040 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.36089080572128296, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 1050 + }, + { + "epoch": 1.4451261077027948, + "grad_norm": 0.33817124366760254, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 1060 + }, + { + "epoch": 1.4587593728698023, + "grad_norm": 0.34334081411361694, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 1070 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.3776826858520508, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1080 + }, + { + "epoch": 1.4860259032038172, + "grad_norm": 0.4169026017189026, + "learning_rate": 0.0002, + "loss": 1.7605, + "step": 1090 + }, + { + "epoch": 1.4996591683708247, + "grad_norm": 0.34898945689201355, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 1100 + }, + { + "epoch": 1.5132924335378322, + "grad_norm": 0.34223780035972595, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 1110 + }, + { + "epoch": 1.5269256987048399, + "grad_norm": 0.3686901032924652, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 1120 + }, + { + "epoch": 1.5405589638718473, + "grad_norm": 0.35054415464401245, + "learning_rate": 0.0002, + "loss": 1.7525, + "step": 1130 + }, + { + "epoch": 1.5541922290388548, + "grad_norm": 0.39496365189552307, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 1140 + }, + { + "epoch": 1.5678254942058623, + "grad_norm": 0.35451626777648926, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 1150 + }, + { + "epoch": 1.58145875937287, + "grad_norm": 0.3848083019256592, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1160 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.3760537803173065, + "learning_rate": 0.0002, + "loss": 1.7272, + "step": 1170 + }, + { + "epoch": 1.6087252897068849, + "grad_norm": 0.38981738686561584, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1180 + }, + { + "epoch": 1.6223585548738924, + "grad_norm": 0.36830949783325195, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 1190 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.3405892848968506, + "learning_rate": 0.0002, + "loss": 1.6925, + "step": 1200 + }, + { + "epoch": 1.6496250852079073, + "grad_norm": 0.39027872681617737, + "learning_rate": 0.0002, + "loss": 1.7473, + "step": 1210 + }, + { + "epoch": 1.6632583503749148, + "grad_norm": 0.3342694044113159, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 1220 + }, + { + "epoch": 1.6768916155419222, + "grad_norm": 0.3600076735019684, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1230 + }, + { + "epoch": 1.6905248807089297, + "grad_norm": 0.3625542223453522, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1240 + }, + { + "epoch": 1.7041581458759372, + "grad_norm": 0.32170894742012024, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1250 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.3544139862060547, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1260 + }, + { + "epoch": 1.7314246762099523, + "grad_norm": 0.35113027691841125, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1270 + }, + { + "epoch": 1.7450579413769598, + "grad_norm": 0.3499974310398102, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1280 + }, + { + "epoch": 1.7586912065439673, + "grad_norm": 0.3285157382488251, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1290 + }, + { + "epoch": 1.7723244717109747, + "grad_norm": 0.3701961636543274, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1300 + }, + { + "epoch": 1.7859577368779824, + "grad_norm": 0.3301318287849426, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 1310 + }, + { + "epoch": 1.79959100204499, + "grad_norm": 0.37801554799079895, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 1320 + }, + { + "epoch": 1.8132242672119974, + "grad_norm": 0.3726748526096344, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1330 + }, + { + "epoch": 1.8268575323790048, + "grad_norm": 0.4059790074825287, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1340 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.35712096095085144, + "learning_rate": 0.0002, + "loss": 1.7739, + "step": 1350 + }, + { + "epoch": 1.8541240627130198, + "grad_norm": 0.35995328426361084, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1360 + }, + { + "epoch": 1.8677573278800272, + "grad_norm": 0.3679947257041931, + "learning_rate": 0.0002, + "loss": 1.7332, + "step": 1370 + }, + { + "epoch": 1.8813905930470347, + "grad_norm": 0.39645957946777344, + "learning_rate": 0.0002, + "loss": 1.7587, + "step": 1380 + }, + { + "epoch": 1.8950238582140422, + "grad_norm": 0.35288700461387634, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1390 + }, + { + "epoch": 1.9086571233810496, + "grad_norm": 0.32579198479652405, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 1400 + }, + { + "epoch": 1.9222903885480571, + "grad_norm": 0.3856561779975891, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1410 + }, + { + "epoch": 1.9359236537150648, + "grad_norm": 0.39019331336021423, + "learning_rate": 0.0002, + "loss": 1.668, + "step": 1420 + }, + { + "epoch": 1.9495569188820723, + "grad_norm": 0.38006502389907837, + "learning_rate": 0.0002, + "loss": 1.7774, + "step": 1430 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.38100454211235046, + "learning_rate": 0.0002, + "loss": 1.8323, + "step": 1440 + }, + { + "epoch": 1.9768234492160872, + "grad_norm": 0.3405798673629761, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 1450 + }, + { + "epoch": 1.990456714383095, + "grad_norm": 0.36582913994789124, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1460 + }, + { + "epoch": 2.0, + "eval_loss": 1.8178424835205078, + "eval_runtime": 53.6524, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 1.193, + "step": 1467 + }, + { + "epoch": 2.0040899795501024, + "grad_norm": 0.3626647889614105, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 1470 + }, + { + "epoch": 2.01772324471711, + "grad_norm": 0.40171775221824646, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 1480 + }, + { + "epoch": 2.0313565098841173, + "grad_norm": 0.5805319547653198, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 1490 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 0.41954153776168823, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 1500 + }, + { + "epoch": 2.0586230402181322, + "grad_norm": 0.47190725803375244, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 1510 + }, + { + "epoch": 2.0722563053851397, + "grad_norm": 0.4388456344604492, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1520 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 2.2171926498413086, + "learning_rate": 0.0002, + "loss": 1.5835, + "step": 1530 + }, + { + "epoch": 2.0995228357191547, + "grad_norm": 0.4314221143722534, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 1540 + }, + { + "epoch": 2.113156100886162, + "grad_norm": 0.4154265522956848, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 1550 + }, + { + "epoch": 2.1267893660531696, + "grad_norm": 0.5025539994239807, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 1560 + }, + { + "epoch": 2.140422631220177, + "grad_norm": 0.5410493016242981, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 1570 + }, + { + "epoch": 2.1540558963871845, + "grad_norm": 0.4478487968444824, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 1580 + }, + { + "epoch": 2.1676891615541924, + "grad_norm": 0.4703652560710907, + "learning_rate": 0.0002, + "loss": 1.5536, + "step": 1590 + }, + { + "epoch": 2.1813224267212, + "grad_norm": 0.4555390179157257, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 1600 + }, + { + "epoch": 2.1949556918882074, + "grad_norm": 0.4877263903617859, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 1610 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.48708245158195496, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1620 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.47523951530456543, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 1630 + }, + { + "epoch": 2.23585548738923, + "grad_norm": 0.4889199733734131, + "learning_rate": 0.0002, + "loss": 1.6013, + "step": 1640 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 0.4585252106189728, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 1650 + }, + { + "epoch": 2.2631220177232447, + "grad_norm": 0.4764868915081024, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1660 + }, + { + "epoch": 2.276755282890252, + "grad_norm": 0.5028976202011108, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 1670 + }, + { + "epoch": 2.2903885480572597, + "grad_norm": 0.46131211519241333, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 1680 + }, + { + "epoch": 2.304021813224267, + "grad_norm": 0.5422874689102173, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 1690 + }, + { + "epoch": 2.3176550783912746, + "grad_norm": 0.47615355253219604, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1700 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 0.48005548119544983, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1710 + }, + { + "epoch": 2.3449216087252895, + "grad_norm": 0.4387182295322418, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1720 + }, + { + "epoch": 2.358554873892297, + "grad_norm": 0.4487272799015045, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 1730 + }, + { + "epoch": 2.372188139059305, + "grad_norm": 0.5046455264091492, + "learning_rate": 0.0002, + "loss": 1.5089, + "step": 1740 + }, + { + "epoch": 2.3858214042263124, + "grad_norm": 0.4653521180152893, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 1750 + }, + { + "epoch": 2.39945466939332, + "grad_norm": 0.4737723469734192, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 1760 + }, + { + "epoch": 2.4130879345603273, + "grad_norm": 0.4501931071281433, + "learning_rate": 0.0002, + "loss": 1.5933, + "step": 1770 + }, + { + "epoch": 2.426721199727335, + "grad_norm": 0.4772880971431732, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 1780 + }, + { + "epoch": 2.4403544648943423, + "grad_norm": 0.4544616937637329, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 1790 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.488313227891922, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 1800 + }, + { + "epoch": 2.467620995228357, + "grad_norm": 0.5057830214500427, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 1810 + }, + { + "epoch": 2.4812542603953647, + "grad_norm": 0.5049484968185425, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 1820 + }, + { + "epoch": 2.494887525562372, + "grad_norm": 0.44966644048690796, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1830 + }, + { + "epoch": 2.5085207907293796, + "grad_norm": 0.5072630643844604, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1840 + }, + { + "epoch": 2.522154055896387, + "grad_norm": 0.43989792466163635, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1850 + }, + { + "epoch": 2.5357873210633946, + "grad_norm": 1.3504403829574585, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 1860 + }, + { + "epoch": 2.549420586230402, + "grad_norm": 0.46545976400375366, + "learning_rate": 0.0002, + "loss": 1.5681, + "step": 1870 + }, + { + "epoch": 2.5630538513974095, + "grad_norm": 0.4678342044353485, + "learning_rate": 0.0002, + "loss": 1.6368, + "step": 1880 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.529755711555481, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 1890 + }, + { + "epoch": 2.5903203817314244, + "grad_norm": 0.5000199675559998, + "learning_rate": 0.0002, + "loss": 1.5861, + "step": 1900 + }, + { + "epoch": 2.6039536468984323, + "grad_norm": 0.5649300217628479, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 1910 + }, + { + "epoch": 2.61758691206544, + "grad_norm": 0.7920585870742798, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 1920 + }, + { + "epoch": 2.6312201772324473, + "grad_norm": 0.4960342049598694, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 1930 + }, + { + "epoch": 2.6448534423994547, + "grad_norm": 0.5324710011482239, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 1940 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 0.606343150138855, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 1950 + }, + { + "epoch": 2.6721199727334697, + "grad_norm": 0.53038489818573, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 1960 + }, + { + "epoch": 2.685753237900477, + "grad_norm": 0.4579465091228485, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 1970 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.4541707932949066, + "learning_rate": 0.0002, + "loss": 1.6093, + "step": 1980 + }, + { + "epoch": 2.713019768234492, + "grad_norm": 0.5009395480155945, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 1990 + }, + { + "epoch": 2.7266530334014996, + "grad_norm": 0.4723006784915924, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 2000 + }, + { + "epoch": 2.740286298568507, + "grad_norm": 0.5086126923561096, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2010 + }, + { + "epoch": 2.7539195637355145, + "grad_norm": 0.47242608666419983, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 2020 + }, + { + "epoch": 2.767552828902522, + "grad_norm": 0.44922566413879395, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2030 + }, + { + "epoch": 2.78118609406953, + "grad_norm": 0.420259565114975, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 2040 + }, + { + "epoch": 2.794819359236537, + "grad_norm": 0.4762881100177765, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 2050 + }, + { + "epoch": 2.808452624403545, + "grad_norm": 0.5228786468505859, + "learning_rate": 0.0002, + "loss": 1.5506, + "step": 2060 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.4796035587787628, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 2070 + }, + { + "epoch": 2.8357191547375598, + "grad_norm": 0.5034735202789307, + "learning_rate": 0.0002, + "loss": 1.6843, + "step": 2080 + }, + { + "epoch": 2.8493524199045672, + "grad_norm": 0.48005399107933044, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 2090 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 0.578820526599884, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2100 + }, + { + "epoch": 2.876618950238582, + "grad_norm": 0.48982638120651245, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 2110 + }, + { + "epoch": 2.8902522154055896, + "grad_norm": 0.5157325863838196, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2120 + }, + { + "epoch": 2.903885480572597, + "grad_norm": 0.49149683117866516, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 2130 + }, + { + "epoch": 2.9175187457396046, + "grad_norm": 0.48584499955177307, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 2140 + }, + { + "epoch": 2.931152010906612, + "grad_norm": 0.5199017524719238, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 2150 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.5788236856460571, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 2160 + }, + { + "epoch": 2.958418541240627, + "grad_norm": 0.48664185404777527, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 2170 + }, + { + "epoch": 2.9720518064076344, + "grad_norm": 0.5026682615280151, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2180 + }, + { + "epoch": 2.9856850715746424, + "grad_norm": 0.49317044019699097, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 2190 + }, + { + "epoch": 2.9993183367416494, + "grad_norm": 0.5729128122329712, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 2200 + }, + { + "epoch": 2.9993183367416494, + "eval_loss": 1.8527295589447021, + "eval_runtime": 53.6403, + "eval_samples_per_second": 9.452, + "eval_steps_per_second": 1.193, + "step": 2200 + }, + { + "epoch": 3.0129516019086573, + "grad_norm": 0.5530241131782532, + "learning_rate": 0.0002, + "loss": 1.4719, + "step": 2210 + }, + { + "epoch": 3.0265848670756648, + "grad_norm": 0.6642216444015503, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 2220 + }, + { + "epoch": 3.0402181322426722, + "grad_norm": 0.61470627784729, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 2230 + }, + { + "epoch": 3.0538513974096797, + "grad_norm": 0.8559566140174866, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 2240 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.7015801668167114, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 2250 + }, + { + "epoch": 3.0811179277436946, + "grad_norm": 0.7226442694664001, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2260 + }, + { + "epoch": 3.094751192910702, + "grad_norm": 0.7560588717460632, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 2270 + }, + { + "epoch": 3.1083844580777096, + "grad_norm": 0.6216568946838379, + "learning_rate": 0.0002, + "loss": 1.4395, + "step": 2280 + }, + { + "epoch": 3.122017723244717, + "grad_norm": 0.6768500804901123, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 2290 + }, + { + "epoch": 3.1356509884117245, + "grad_norm": 0.7028762102127075, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 2300 + }, + { + "epoch": 3.149284253578732, + "grad_norm": 0.6329697966575623, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 2310 + }, + { + "epoch": 3.1629175187457395, + "grad_norm": 0.6328264474868774, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 2320 + }, + { + "epoch": 3.176550783912747, + "grad_norm": 0.7573632001876831, + "learning_rate": 0.0002, + "loss": 1.3762, + "step": 2330 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.595740795135498, + "learning_rate": 0.0002, + "loss": 1.3553, + "step": 2340 + }, + { + "epoch": 3.2038173142467623, + "grad_norm": 0.7111806869506836, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2350 + }, + { + "epoch": 3.2174505794137698, + "grad_norm": 0.6328730583190918, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 2360 + }, + { + "epoch": 3.2310838445807772, + "grad_norm": 0.5860254168510437, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 2370 + }, + { + "epoch": 3.2447171097477847, + "grad_norm": 0.7387157082557678, + "learning_rate": 0.0002, + "loss": 1.4267, + "step": 2380 + }, + { + "epoch": 3.258350374914792, + "grad_norm": 0.6897673606872559, + "learning_rate": 0.0002, + "loss": 1.4837, + "step": 2390 + }, + { + "epoch": 3.2719836400817996, + "grad_norm": 0.7157699465751648, + "learning_rate": 0.0002, + "loss": 1.4372, + "step": 2400 + }, + { + "epoch": 3.285616905248807, + "grad_norm": 0.6422511339187622, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 2410 + }, + { + "epoch": 3.2992501704158146, + "grad_norm": 1.0481886863708496, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 2420 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.7050786018371582, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 2430 + }, + { + "epoch": 3.3265167007498295, + "grad_norm": 0.6090759038925171, + "learning_rate": 0.0002, + "loss": 1.3465, + "step": 2440 + }, + { + "epoch": 3.340149965916837, + "grad_norm": 0.6626465320587158, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 2450 + }, + { + "epoch": 3.3537832310838445, + "grad_norm": 0.6565486788749695, + "learning_rate": 0.0002, + "loss": 1.4512, + "step": 2460 + }, + { + "epoch": 3.367416496250852, + "grad_norm": 0.6449528932571411, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2470 + }, + { + "epoch": 3.3810497614178594, + "grad_norm": 0.7746227383613586, + "learning_rate": 0.0002, + "loss": 1.4773, + "step": 2480 + }, + { + "epoch": 3.3946830265848673, + "grad_norm": 0.7074846029281616, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 2490 + }, + { + "epoch": 3.4083162917518743, + "grad_norm": 0.6547690033912659, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 2500 + }, + { + "epoch": 3.4219495569188823, + "grad_norm": 0.784721314907074, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2510 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.7270277738571167, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 2520 + }, + { + "epoch": 3.449216087252897, + "grad_norm": 0.67588871717453, + "learning_rate": 0.0002, + "loss": 1.4354, + "step": 2530 + }, + { + "epoch": 3.4628493524199047, + "grad_norm": 0.6768023371696472, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 2540 + }, + { + "epoch": 3.476482617586912, + "grad_norm": 0.7026481628417969, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 2550 + }, + { + "epoch": 3.4901158827539196, + "grad_norm": 0.646075963973999, + "learning_rate": 0.0002, + "loss": 1.468, + "step": 2560 + }, + { + "epoch": 3.503749147920927, + "grad_norm": 0.6288973689079285, + "learning_rate": 0.0002, + "loss": 1.4058, + "step": 2570 + }, + { + "epoch": 3.5173824130879345, + "grad_norm": 0.6440825462341309, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 2580 + }, + { + "epoch": 3.531015678254942, + "grad_norm": 0.7074111700057983, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2590 + }, + { + "epoch": 3.5446489434219495, + "grad_norm": 0.7007562518119812, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2600 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 0.6045376658439636, + "learning_rate": 0.0002, + "loss": 1.4511, + "step": 2610 + }, + { + "epoch": 3.5719154737559644, + "grad_norm": 0.9149952530860901, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 2620 + }, + { + "epoch": 3.585548738922972, + "grad_norm": 0.6490362882614136, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2630 + }, + { + "epoch": 3.59918200408998, + "grad_norm": 0.6552226543426514, + "learning_rate": 0.0002, + "loss": 1.4107, + "step": 2640 + }, + { + "epoch": 3.612815269256987, + "grad_norm": 0.6541850566864014, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 2650 + }, + { + "epoch": 3.6264485344239947, + "grad_norm": 0.6500770449638367, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 2660 + }, + { + "epoch": 3.640081799591002, + "grad_norm": 0.6345893740653992, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 2670 + }, + { + "epoch": 3.6537150647580097, + "grad_norm": 0.6382275223731995, + "learning_rate": 0.0002, + "loss": 1.3634, + "step": 2680 + }, + { + "epoch": 3.667348329925017, + "grad_norm": 0.6738566160202026, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 2690 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.7446315288543701, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 2700 + }, + { + "epoch": 3.694614860259032, + "grad_norm": 0.6717571020126343, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 2710 + }, + { + "epoch": 3.7082481254260395, + "grad_norm": 0.667259693145752, + "learning_rate": 0.0002, + "loss": 1.4285, + "step": 2720 + }, + { + "epoch": 3.721881390593047, + "grad_norm": 0.6808622479438782, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 2730 + }, + { + "epoch": 3.7355146557600545, + "grad_norm": 0.7254287004470825, + "learning_rate": 0.0002, + "loss": 1.4297, + "step": 2740 + }, + { + "epoch": 3.749147920927062, + "grad_norm": 0.6864007711410522, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 2750 + }, + { + "epoch": 3.7627811860940694, + "grad_norm": 0.7041361331939697, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 2760 + }, + { + "epoch": 3.776414451261077, + "grad_norm": 0.6559903025627136, + "learning_rate": 0.0002, + "loss": 1.4284, + "step": 2770 + }, + { + "epoch": 3.7900477164280844, + "grad_norm": 0.6602269411087036, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 2780 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.692611813545227, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 2790 + }, + { + "epoch": 3.8173142467620993, + "grad_norm": 0.7051475644111633, + "learning_rate": 0.0002, + "loss": 1.4065, + "step": 2800 + }, + { + "epoch": 3.830947511929107, + "grad_norm": 0.6685371398925781, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 2810 + }, + { + "epoch": 3.8445807770961147, + "grad_norm": 0.6706477403640747, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2820 + }, + { + "epoch": 3.858214042263122, + "grad_norm": 0.6671637296676636, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 2830 + }, + { + "epoch": 3.8718473074301296, + "grad_norm": 0.694092333316803, + "learning_rate": 0.0002, + "loss": 1.4736, + "step": 2840 + }, + { + "epoch": 3.885480572597137, + "grad_norm": 0.7349600195884705, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 2850 + }, + { + "epoch": 3.8991138377641446, + "grad_norm": 0.6647971868515015, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 2860 + }, + { + "epoch": 3.912747102931152, + "grad_norm": 0.806656539440155, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2870 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.6008772850036621, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 2880 + }, + { + "epoch": 3.940013633265167, + "grad_norm": 0.659227728843689, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2890 + }, + { + "epoch": 3.9536468984321744, + "grad_norm": 0.6357656717300415, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 2900 + }, + { + "epoch": 3.967280163599182, + "grad_norm": 0.6541687846183777, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2910 + }, + { + "epoch": 3.9809134287661894, + "grad_norm": 0.6090909838676453, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2920 + }, + { + "epoch": 3.994546693933197, + "grad_norm": 0.7198411822319031, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2930 + }, + { + "epoch": 4.0, + "eval_loss": 1.9278366565704346, + "eval_runtime": 53.6567, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 2934 + }, + { + "epoch": 4.008179959100205, + "grad_norm": 0.6498575210571289, + "learning_rate": 0.0002, + "loss": 1.3159, + "step": 2940 + }, + { + "epoch": 4.021813224267212, + "grad_norm": 0.865602433681488, + "learning_rate": 0.0002, + "loss": 1.2075, + "step": 2950 + }, + { + "epoch": 4.03544648943422, + "grad_norm": 0.8514999151229858, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 2960 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 1.0677322149276733, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 2970 + }, + { + "epoch": 4.062713019768235, + "grad_norm": 1.0126488208770752, + "learning_rate": 0.0002, + "loss": 1.1962, + "step": 2980 + }, + { + "epoch": 4.076346284935242, + "grad_norm": 1.0008870363235474, + "learning_rate": 0.0002, + "loss": 1.1631, + "step": 2990 + }, + { + "epoch": 4.08997955010225, + "grad_norm": 0.7942054271697998, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 3000 + }, + { + "epoch": 4.103612815269257, + "grad_norm": 1.0482100248336792, + "learning_rate": 0.0002, + "loss": 1.214, + "step": 3010 + }, + { + "epoch": 4.1172460804362645, + "grad_norm": 1.0516992807388306, + "learning_rate": 0.0002, + "loss": 1.1999, + "step": 3020 + }, + { + "epoch": 4.130879345603272, + "grad_norm": 0.8144322037696838, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 3030 + }, + { + "epoch": 4.144512610770279, + "grad_norm": 0.952297568321228, + "learning_rate": 0.0002, + "loss": 1.1782, + "step": 3040 + }, + { + "epoch": 4.158145875937287, + "grad_norm": 1.007645606994629, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 3050 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 1.0480353832244873, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 3060 + }, + { + "epoch": 4.185412406271302, + "grad_norm": 0.9270663857460022, + "learning_rate": 0.0002, + "loss": 1.196, + "step": 3070 + }, + { + "epoch": 4.199045671438309, + "grad_norm": 1.3415262699127197, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 3080 + }, + { + "epoch": 4.212678936605317, + "grad_norm": 1.167606234550476, + "learning_rate": 0.0002, + "loss": 1.2601, + "step": 3090 + }, + { + "epoch": 4.226312201772324, + "grad_norm": 0.9418690800666809, + "learning_rate": 0.0002, + "loss": 1.2605, + "step": 3100 + }, + { + "epoch": 4.239945466939332, + "grad_norm": 1.0885876417160034, + "learning_rate": 0.0002, + "loss": 1.2184, + "step": 3110 + }, + { + "epoch": 4.253578732106339, + "grad_norm": 0.9165483713150024, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 3120 + }, + { + "epoch": 4.267211997273347, + "grad_norm": 0.9154694080352783, + "learning_rate": 0.0002, + "loss": 1.2933, + "step": 3130 + }, + { + "epoch": 4.280845262440354, + "grad_norm": 1.100580096244812, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 3140 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.9367576241493225, + "learning_rate": 0.0002, + "loss": 1.251, + "step": 3150 + }, + { + "epoch": 4.308111792774369, + "grad_norm": 0.9744015336036682, + "learning_rate": 0.0002, + "loss": 1.2032, + "step": 3160 + }, + { + "epoch": 4.321745057941377, + "grad_norm": 0.9865175485610962, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 3170 + }, + { + "epoch": 4.335378323108385, + "grad_norm": 1.0124907493591309, + "learning_rate": 0.0002, + "loss": 1.2161, + "step": 3180 + }, + { + "epoch": 4.349011588275392, + "grad_norm": 1.1044819355010986, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 3190 + }, + { + "epoch": 4.3626448534424, + "grad_norm": 0.9305577278137207, + "learning_rate": 0.0002, + "loss": 1.2483, + "step": 3200 + }, + { + "epoch": 4.376278118609407, + "grad_norm": 0.969265341758728, + "learning_rate": 0.0002, + "loss": 1.2101, + "step": 3210 + }, + { + "epoch": 4.389911383776415, + "grad_norm": 1.0671923160552979, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 3220 + }, + { + "epoch": 4.403544648943422, + "grad_norm": 0.9440539479255676, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 3230 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 0.9824562668800354, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 3240 + }, + { + "epoch": 4.430811179277437, + "grad_norm": 1.0245535373687744, + "learning_rate": 0.0002, + "loss": 1.2234, + "step": 3250 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.9629312753677368, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 3260 + }, + { + "epoch": 4.458077709611452, + "grad_norm": 1.1556470394134521, + "learning_rate": 0.0002, + "loss": 1.2689, + "step": 3270 + }, + { + "epoch": 4.47171097477846, + "grad_norm": 0.9796679019927979, + "learning_rate": 0.0002, + "loss": 1.2214, + "step": 3280 + }, + { + "epoch": 4.485344239945467, + "grad_norm": 0.9030535221099854, + "learning_rate": 0.0002, + "loss": 1.2823, + "step": 3290 + }, + { + "epoch": 4.4989775051124745, + "grad_norm": 0.9142820835113525, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 3300 + }, + { + "epoch": 4.5126107702794815, + "grad_norm": 0.966867208480835, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 3310 + }, + { + "epoch": 4.5262440354464895, + "grad_norm": 1.0127079486846924, + "learning_rate": 0.0002, + "loss": 1.2537, + "step": 3320 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 1.055506706237793, + "learning_rate": 0.0002, + "loss": 1.2059, + "step": 3330 + }, + { + "epoch": 4.553510565780504, + "grad_norm": 0.9831468462944031, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 3340 + }, + { + "epoch": 4.567143830947512, + "grad_norm": 0.9304661154747009, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 3350 + }, + { + "epoch": 4.580777096114519, + "grad_norm": 0.9369107484817505, + "learning_rate": 0.0002, + "loss": 1.3621, + "step": 3360 + }, + { + "epoch": 4.594410361281527, + "grad_norm": 1.009506344795227, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 3370 + }, + { + "epoch": 4.608043626448534, + "grad_norm": 1.0575741529464722, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 3380 + }, + { + "epoch": 4.621676891615542, + "grad_norm": 0.9102860689163208, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 3390 + }, + { + "epoch": 4.635310156782549, + "grad_norm": 0.8111315965652466, + "learning_rate": 0.0002, + "loss": 1.3156, + "step": 3400 + }, + { + "epoch": 4.648943421949557, + "grad_norm": 0.9459649920463562, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 3410 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 0.9709545969963074, + "learning_rate": 0.0002, + "loss": 1.3146, + "step": 3420 + }, + { + "epoch": 4.676209952283572, + "grad_norm": 0.9909247159957886, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 3430 + }, + { + "epoch": 4.689843217450579, + "grad_norm": 0.9094610810279846, + "learning_rate": 0.0002, + "loss": 1.3186, + "step": 3440 + }, + { + "epoch": 4.703476482617587, + "grad_norm": 0.9012220501899719, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 3450 + }, + { + "epoch": 4.717109747784594, + "grad_norm": 0.8669242858886719, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 3460 + }, + { + "epoch": 4.730743012951602, + "grad_norm": 0.9753699898719788, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 3470 + }, + { + "epoch": 4.74437627811861, + "grad_norm": 1.0252684354782104, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 3480 + }, + { + "epoch": 4.758009543285617, + "grad_norm": 1.208098292350769, + "learning_rate": 0.0002, + "loss": 1.2536, + "step": 3490 + }, + { + "epoch": 4.771642808452625, + "grad_norm": 0.8632914423942566, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 3500 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 1.0084818601608276, + "learning_rate": 0.0002, + "loss": 1.3062, + "step": 3510 + }, + { + "epoch": 4.79890933878664, + "grad_norm": 0.9095172882080078, + "learning_rate": 0.0002, + "loss": 1.3004, + "step": 3520 + }, + { + "epoch": 4.812542603953647, + "grad_norm": 0.9740135669708252, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 3530 + }, + { + "epoch": 4.826175869120655, + "grad_norm": 0.8862348794937134, + "learning_rate": 0.0002, + "loss": 1.2816, + "step": 3540 + }, + { + "epoch": 4.839809134287662, + "grad_norm": 1.0761774778366089, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 3550 + }, + { + "epoch": 4.85344239945467, + "grad_norm": 1.0134117603302002, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 3560 + }, + { + "epoch": 4.867075664621677, + "grad_norm": 0.9262851476669312, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 3570 + }, + { + "epoch": 4.8807089297886845, + "grad_norm": 0.9518504738807678, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 3580 + }, + { + "epoch": 4.894342194955692, + "grad_norm": 1.10103178024292, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 3590 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 1.0133225917816162, + "learning_rate": 0.0002, + "loss": 1.2592, + "step": 3600 + }, + { + "epoch": 4.9216087252897065, + "grad_norm": 0.9637737274169922, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 3610 + }, + { + "epoch": 4.935241990456714, + "grad_norm": 0.9800633192062378, + "learning_rate": 0.0002, + "loss": 1.2991, + "step": 3620 + }, + { + "epoch": 4.948875255623722, + "grad_norm": 1.0065973997116089, + "learning_rate": 0.0002, + "loss": 1.2872, + "step": 3630 + }, + { + "epoch": 4.962508520790729, + "grad_norm": 0.9354690313339233, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 3640 + }, + { + "epoch": 4.976141785957737, + "grad_norm": 0.9744119048118591, + "learning_rate": 0.0002, + "loss": 1.291, + "step": 3650 + }, + { + "epoch": 4.989775051124744, + "grad_norm": 0.9357708096504211, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 3660 + }, + { + "epoch": 4.999318336741649, + "eval_loss": 2.0763096809387207, + "eval_runtime": 53.6578, + "eval_samples_per_second": 9.449, + "eval_steps_per_second": 1.193, + "step": 3667 + }, + { + "epoch": 5.003408316291752, + "grad_norm": 1.3171669244766235, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 3670 + }, + { + "epoch": 5.017041581458759, + "grad_norm": 1.4427374601364136, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 3680 + }, + { + "epoch": 5.030674846625767, + "grad_norm": 0.9313797354698181, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 3690 + }, + { + "epoch": 5.044308111792774, + "grad_norm": 1.417641282081604, + "learning_rate": 0.0002, + "loss": 0.9481, + "step": 3700 + }, + { + "epoch": 5.057941376959782, + "grad_norm": 1.097440242767334, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 3710 + }, + { + "epoch": 5.071574642126789, + "grad_norm": 1.4277986288070679, + "learning_rate": 0.0002, + "loss": 1.0416, + "step": 3720 + }, + { + "epoch": 5.085207907293797, + "grad_norm": 1.2520873546600342, + "learning_rate": 0.0002, + "loss": 0.9718, + "step": 3730 + }, + { + "epoch": 5.098841172460804, + "grad_norm": 1.39503812789917, + "learning_rate": 0.0002, + "loss": 0.9531, + "step": 3740 + }, + { + "epoch": 5.112474437627812, + "grad_norm": 1.2345329523086548, + "learning_rate": 0.0002, + "loss": 0.9658, + "step": 3750 + }, + { + "epoch": 5.126107702794819, + "grad_norm": 1.2700239419937134, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 3760 + }, + { + "epoch": 5.139740967961827, + "grad_norm": 1.5343066453933716, + "learning_rate": 0.0002, + "loss": 0.993, + "step": 3770 + }, + { + "epoch": 5.153374233128835, + "grad_norm": 1.4191608428955078, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 3780 + }, + { + "epoch": 5.167007498295842, + "grad_norm": 1.4591023921966553, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 3790 + }, + { + "epoch": 5.18064076346285, + "grad_norm": 1.6158121824264526, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 3800 + }, + { + "epoch": 5.194274028629857, + "grad_norm": 1.6077582836151123, + "learning_rate": 0.0002, + "loss": 1.0056, + "step": 3810 + }, + { + "epoch": 5.207907293796865, + "grad_norm": 1.2815653085708618, + "learning_rate": 0.0002, + "loss": 0.9711, + "step": 3820 + }, + { + "epoch": 5.221540558963872, + "grad_norm": 1.2427219152450562, + "learning_rate": 0.0002, + "loss": 1.0131, + "step": 3830 + }, + { + "epoch": 5.23517382413088, + "grad_norm": 1.3013232946395874, + "learning_rate": 0.0002, + "loss": 0.9901, + "step": 3840 + }, + { + "epoch": 5.248807089297887, + "grad_norm": 1.4643588066101074, + "learning_rate": 0.0002, + "loss": 0.9862, + "step": 3850 + }, + { + "epoch": 5.2624403544648946, + "grad_norm": 1.2571916580200195, + "learning_rate": 0.0002, + "loss": 1.0149, + "step": 3860 + }, + { + "epoch": 5.276073619631902, + "grad_norm": 1.226682186126709, + "learning_rate": 0.0002, + "loss": 0.9686, + "step": 3870 + }, + { + "epoch": 5.2897068847989095, + "grad_norm": 1.2541271448135376, + "learning_rate": 0.0002, + "loss": 0.9417, + "step": 3880 + }, + { + "epoch": 5.3033401499659165, + "grad_norm": 1.2340261936187744, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 3890 + }, + { + "epoch": 5.316973415132924, + "grad_norm": 1.345527172088623, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 3900 + }, + { + "epoch": 5.3306066802999315, + "grad_norm": 1.2128909826278687, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 3910 + }, + { + "epoch": 5.344239945466939, + "grad_norm": 1.3052637577056885, + "learning_rate": 0.0002, + "loss": 1.0002, + "step": 3920 + }, + { + "epoch": 5.357873210633947, + "grad_norm": 1.1017392873764038, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 3930 + }, + { + "epoch": 5.371506475800954, + "grad_norm": 1.26950204372406, + "learning_rate": 0.0002, + "loss": 1.0579, + "step": 3940 + }, + { + "epoch": 5.385139740967962, + "grad_norm": 1.3372546434402466, + "learning_rate": 0.0002, + "loss": 1.0816, + "step": 3950 + }, + { + "epoch": 5.398773006134969, + "grad_norm": 1.3115156888961792, + "learning_rate": 0.0002, + "loss": 1.0529, + "step": 3960 + }, + { + "epoch": 5.412406271301977, + "grad_norm": 1.3511474132537842, + "learning_rate": 0.0002, + "loss": 1.1179, + "step": 3970 + }, + { + "epoch": 5.426039536468984, + "grad_norm": 1.1001893281936646, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 3980 + }, + { + "epoch": 5.439672801635992, + "grad_norm": 1.2810745239257812, + "learning_rate": 0.0002, + "loss": 1.0855, + "step": 3990 + }, + { + "epoch": 5.453306066802999, + "grad_norm": 1.2999306917190552, + "learning_rate": 0.0002, + "loss": 1.0573, + "step": 4000 + }, + { + "epoch": 5.466939331970007, + "grad_norm": 1.172553300857544, + "learning_rate": 0.0002, + "loss": 1.0073, + "step": 4010 + }, + { + "epoch": 5.480572597137014, + "grad_norm": 1.1483557224273682, + "learning_rate": 0.0002, + "loss": 1.003, + "step": 4020 + }, + { + "epoch": 5.494205862304022, + "grad_norm": 1.4148036241531372, + "learning_rate": 0.0002, + "loss": 1.0704, + "step": 4030 + }, + { + "epoch": 5.507839127471029, + "grad_norm": 1.1611121892929077, + "learning_rate": 0.0002, + "loss": 1.0519, + "step": 4040 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 1.3837119340896606, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 4050 + }, + { + "epoch": 5.535105657805044, + "grad_norm": 1.3025696277618408, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 4060 + }, + { + "epoch": 5.548738922972052, + "grad_norm": 1.348091959953308, + "learning_rate": 0.0002, + "loss": 1.0628, + "step": 4070 + }, + { + "epoch": 5.56237218813906, + "grad_norm": 1.3463449478149414, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 4080 + }, + { + "epoch": 5.576005453306067, + "grad_norm": 1.3904176950454712, + "learning_rate": 0.0002, + "loss": 1.039, + "step": 4090 + }, + { + "epoch": 5.589638718473074, + "grad_norm": 1.2737950086593628, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 4100 + }, + { + "epoch": 5.603271983640082, + "grad_norm": 1.3311827182769775, + "learning_rate": 0.0002, + "loss": 1.0441, + "step": 4110 + }, + { + "epoch": 5.61690524880709, + "grad_norm": 1.24485182762146, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 4120 + }, + { + "epoch": 5.630538513974097, + "grad_norm": 1.2724957466125488, + "learning_rate": 0.0002, + "loss": 1.1103, + "step": 4130 + }, + { + "epoch": 5.644171779141105, + "grad_norm": 1.3439847230911255, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 4140 + }, + { + "epoch": 5.657805044308112, + "grad_norm": 1.372359037399292, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 4150 + }, + { + "epoch": 5.6714383094751195, + "grad_norm": 1.2322949171066284, + "learning_rate": 0.0002, + "loss": 1.0475, + "step": 4160 + }, + { + "epoch": 5.6850715746421265, + "grad_norm": 1.4859193563461304, + "learning_rate": 0.0002, + "loss": 1.0465, + "step": 4170 + }, + { + "epoch": 5.6987048398091344, + "grad_norm": 1.4318448305130005, + "learning_rate": 0.0002, + "loss": 1.1569, + "step": 4180 + }, + { + "epoch": 5.7123381049761415, + "grad_norm": 1.1533565521240234, + "learning_rate": 0.0002, + "loss": 1.017, + "step": 4190 + }, + { + "epoch": 5.725971370143149, + "grad_norm": 1.3009696006774902, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 4200 + }, + { + "epoch": 5.739604635310156, + "grad_norm": 1.3972162008285522, + "learning_rate": 0.0002, + "loss": 1.1229, + "step": 4210 + }, + { + "epoch": 5.753237900477164, + "grad_norm": 1.2142186164855957, + "learning_rate": 0.0002, + "loss": 1.033, + "step": 4220 + }, + { + "epoch": 5.766871165644172, + "grad_norm": 1.401191234588623, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 4230 + }, + { + "epoch": 5.780504430811179, + "grad_norm": 1.4124404191970825, + "learning_rate": 0.0002, + "loss": 1.0722, + "step": 4240 + }, + { + "epoch": 5.794137695978186, + "grad_norm": 1.3488332033157349, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 4250 + }, + { + "epoch": 5.807770961145194, + "grad_norm": 1.3671752214431763, + "learning_rate": 0.0002, + "loss": 1.0599, + "step": 4260 + }, + { + "epoch": 5.821404226312202, + "grad_norm": 1.2608201503753662, + "learning_rate": 0.0002, + "loss": 1.1294, + "step": 4270 + }, + { + "epoch": 5.835037491479209, + "grad_norm": 1.1814045906066895, + "learning_rate": 0.0002, + "loss": 1.1216, + "step": 4280 + }, + { + "epoch": 5.848670756646217, + "grad_norm": 1.4139586687088013, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 4290 + }, + { + "epoch": 5.862304021813224, + "grad_norm": 1.34248948097229, + "learning_rate": 0.0002, + "loss": 1.0656, + "step": 4300 + }, + { + "epoch": 5.875937286980232, + "grad_norm": 1.1428139209747314, + "learning_rate": 0.0002, + "loss": 1.0791, + "step": 4310 + }, + { + "epoch": 5.889570552147239, + "grad_norm": 1.1941087245941162, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 4320 + }, + { + "epoch": 5.903203817314247, + "grad_norm": 1.2374001741409302, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 4330 + }, + { + "epoch": 5.916837082481254, + "grad_norm": 1.4314988851547241, + "learning_rate": 0.0002, + "loss": 1.0802, + "step": 4340 + }, + { + "epoch": 5.930470347648262, + "grad_norm": 1.1286126375198364, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 4350 + }, + { + "epoch": 5.944103612815269, + "grad_norm": 1.25884211063385, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 4360 + }, + { + "epoch": 5.957736877982277, + "grad_norm": 1.223357915878296, + "learning_rate": 0.0002, + "loss": 1.1189, + "step": 4370 + }, + { + "epoch": 5.971370143149285, + "grad_norm": 1.2173810005187988, + "learning_rate": 0.0002, + "loss": 1.1335, + "step": 4380 + }, + { + "epoch": 5.985003408316292, + "grad_norm": 1.3152292966842651, + "learning_rate": 0.0002, + "loss": 1.1201, + "step": 4390 + }, + { + "epoch": 5.998636673483299, + "grad_norm": 1.5576739311218262, + "learning_rate": 0.0002, + "loss": 1.1456, + "step": 4400 + }, + { + "epoch": 6.0, + "eval_loss": 2.3435311317443848, + "eval_runtime": 53.6362, + "eval_samples_per_second": 9.453, + "eval_steps_per_second": 1.193, + "step": 4401 + }, + { + "epoch": 6.012269938650307, + "grad_norm": 2.027981758117676, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 4410 + }, + { + "epoch": 6.025903203817315, + "grad_norm": 1.4775491952896118, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 4420 + }, + { + "epoch": 6.039536468984322, + "grad_norm": 1.6902967691421509, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 4430 + }, + { + "epoch": 6.0531697341513295, + "grad_norm": 1.2506479024887085, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4440 + }, + { + "epoch": 6.0668029993183366, + "grad_norm": 1.5935661792755127, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 4450 + }, + { + "epoch": 6.0804362644853445, + "grad_norm": 1.2966011762619019, + "learning_rate": 0.0002, + "loss": 0.7869, + "step": 4460 + }, + { + "epoch": 6.0940695296523515, + "grad_norm": 1.5247948169708252, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 4470 + }, + { + "epoch": 6.107702794819359, + "grad_norm": 1.6415225267410278, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 4480 + }, + { + "epoch": 6.121336059986366, + "grad_norm": 1.5510778427124023, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 4490 + }, + { + "epoch": 6.134969325153374, + "grad_norm": 1.361097812652588, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 4500 + }, + { + "epoch": 6.148602590320381, + "grad_norm": 1.8347383737564087, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 4510 + }, + { + "epoch": 6.162235855487389, + "grad_norm": 1.570560097694397, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4520 + }, + { + "epoch": 6.175869120654396, + "grad_norm": 1.517993688583374, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4530 + }, + { + "epoch": 6.189502385821404, + "grad_norm": 1.4517489671707153, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 4540 + }, + { + "epoch": 6.203135650988412, + "grad_norm": 1.557098627090454, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 4550 + }, + { + "epoch": 6.216768916155419, + "grad_norm": 1.7379891872406006, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4560 + }, + { + "epoch": 6.230402181322427, + "grad_norm": 2.2292542457580566, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 4570 + }, + { + "epoch": 6.244035446489434, + "grad_norm": 1.834366798400879, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 4580 + }, + { + "epoch": 6.257668711656442, + "grad_norm": 1.6755090951919556, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 4590 + }, + { + "epoch": 6.271301976823449, + "grad_norm": 1.828898549079895, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4600 + }, + { + "epoch": 6.284935241990457, + "grad_norm": 1.9773457050323486, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 4610 + }, + { + "epoch": 6.298568507157464, + "grad_norm": 1.533369541168213, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4620 + }, + { + "epoch": 6.312201772324472, + "grad_norm": 1.5432997941970825, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4630 + }, + { + "epoch": 6.325835037491479, + "grad_norm": 1.6686866283416748, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 4640 + }, + { + "epoch": 6.339468302658487, + "grad_norm": 1.545304298400879, + "learning_rate": 0.0002, + "loss": 0.8656, + "step": 4650 + }, + { + "epoch": 6.353101567825494, + "grad_norm": 1.5981945991516113, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 4660 + }, + { + "epoch": 6.366734832992502, + "grad_norm": 1.6973154544830322, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4670 + }, + { + "epoch": 6.38036809815951, + "grad_norm": 1.6782612800598145, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 4680 + }, + { + "epoch": 6.394001363326517, + "grad_norm": 1.5710086822509766, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 4690 + }, + { + "epoch": 6.407634628493525, + "grad_norm": 1.7241147756576538, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 4700 + }, + { + "epoch": 6.421267893660532, + "grad_norm": 1.7736736536026, + "learning_rate": 0.0002, + "loss": 0.8768, + "step": 4710 + }, + { + "epoch": 6.4349011588275395, + "grad_norm": 1.7924901247024536, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 4720 + }, + { + "epoch": 6.448534423994547, + "grad_norm": 1.4030500650405884, + "learning_rate": 0.0002, + "loss": 0.832, + "step": 4730 + }, + { + "epoch": 6.4621676891615545, + "grad_norm": 1.6925519704818726, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 4740 + }, + { + "epoch": 6.4758009543285615, + "grad_norm": 1.362905502319336, + "learning_rate": 0.0002, + "loss": 0.8556, + "step": 4750 + }, + { + "epoch": 6.489434219495569, + "grad_norm": 1.5281150341033936, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 4760 + }, + { + "epoch": 6.5030674846625764, + "grad_norm": 1.524671196937561, + "learning_rate": 0.0002, + "loss": 0.8396, + "step": 4770 + }, + { + "epoch": 6.516700749829584, + "grad_norm": 1.7029320001602173, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 4780 + }, + { + "epoch": 6.530334014996591, + "grad_norm": 1.4663511514663696, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 4790 + }, + { + "epoch": 6.543967280163599, + "grad_norm": 1.7682101726531982, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 4800 + }, + { + "epoch": 6.557600545330606, + "grad_norm": 1.6056565046310425, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 4810 + }, + { + "epoch": 6.571233810497614, + "grad_norm": 1.6552391052246094, + "learning_rate": 0.0002, + "loss": 0.8747, + "step": 4820 + }, + { + "epoch": 6.584867075664622, + "grad_norm": 1.4265215396881104, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 4830 + }, + { + "epoch": 6.598500340831629, + "grad_norm": 1.6225470304489136, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 4840 + }, + { + "epoch": 6.612133605998636, + "grad_norm": 1.6568684577941895, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 4850 + }, + { + "epoch": 6.625766871165644, + "grad_norm": 1.760115146636963, + "learning_rate": 0.0002, + "loss": 0.8825, + "step": 4860 + }, + { + "epoch": 6.639400136332652, + "grad_norm": 1.627966046333313, + "learning_rate": 0.0002, + "loss": 0.9227, + "step": 4870 + }, + { + "epoch": 6.653033401499659, + "grad_norm": 1.7053254842758179, + "learning_rate": 0.0002, + "loss": 0.8825, + "step": 4880 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.5339484214782715, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 4890 + }, + { + "epoch": 6.680299931833674, + "grad_norm": 1.5594874620437622, + "learning_rate": 0.0002, + "loss": 0.8482, + "step": 4900 + }, + { + "epoch": 6.693933197000682, + "grad_norm": 1.5322152376174927, + "learning_rate": 0.0002, + "loss": 0.842, + "step": 4910 + }, + { + "epoch": 6.707566462167689, + "grad_norm": 1.733410358428955, + "learning_rate": 0.0002, + "loss": 0.8049, + "step": 4920 + }, + { + "epoch": 6.721199727334697, + "grad_norm": 1.3626887798309326, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 4930 + }, + { + "epoch": 6.734832992501704, + "grad_norm": 1.6323494911193848, + "learning_rate": 0.0002, + "loss": 0.9481, + "step": 4940 + }, + { + "epoch": 6.748466257668712, + "grad_norm": 1.6548917293548584, + "learning_rate": 0.0002, + "loss": 0.8803, + "step": 4950 + }, + { + "epoch": 6.762099522835719, + "grad_norm": 1.7894278764724731, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 4960 + }, + { + "epoch": 6.775732788002727, + "grad_norm": 1.7960841655731201, + "learning_rate": 0.0002, + "loss": 0.9137, + "step": 4970 + }, + { + "epoch": 6.789366053169735, + "grad_norm": 1.4888852834701538, + "learning_rate": 0.0002, + "loss": 0.9088, + "step": 4980 + }, + { + "epoch": 6.802999318336742, + "grad_norm": 1.6368865966796875, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 4990 + }, + { + "epoch": 6.816632583503749, + "grad_norm": 1.7106667757034302, + "learning_rate": 0.0002, + "loss": 0.9939, + "step": 5000 + }, + { + "epoch": 6.830265848670757, + "grad_norm": 4.131956100463867, + "learning_rate": 0.0002, + "loss": 0.8551, + "step": 5010 + }, + { + "epoch": 6.8438991138377645, + "grad_norm": 1.6357536315917969, + "learning_rate": 0.0002, + "loss": 0.908, + "step": 5020 + }, + { + "epoch": 6.8575323790047715, + "grad_norm": 1.621524453163147, + "learning_rate": 0.0002, + "loss": 0.8661, + "step": 5030 + }, + { + "epoch": 6.871165644171779, + "grad_norm": 1.6400790214538574, + "learning_rate": 0.0002, + "loss": 0.9177, + "step": 5040 + }, + { + "epoch": 6.8847989093387865, + "grad_norm": 1.823006272315979, + "learning_rate": 0.0002, + "loss": 0.9204, + "step": 5050 + }, + { + "epoch": 6.898432174505794, + "grad_norm": 1.6328210830688477, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 5060 + }, + { + "epoch": 6.912065439672801, + "grad_norm": 1.3616089820861816, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 5070 + }, + { + "epoch": 6.925698704839809, + "grad_norm": 1.7202986478805542, + "learning_rate": 0.0002, + "loss": 0.8791, + "step": 5080 + }, + { + "epoch": 6.939331970006816, + "grad_norm": 1.8145297765731812, + "learning_rate": 0.0002, + "loss": 0.8331, + "step": 5090 + }, + { + "epoch": 6.952965235173824, + "grad_norm": 1.5432910919189453, + "learning_rate": 0.0002, + "loss": 0.861, + "step": 5100 + }, + { + "epoch": 6.966598500340831, + "grad_norm": 1.2784099578857422, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 5110 + }, + { + "epoch": 6.980231765507839, + "grad_norm": 1.556593894958496, + "learning_rate": 0.0002, + "loss": 0.9189, + "step": 5120 + }, + { + "epoch": 6.993865030674847, + "grad_norm": 1.5102856159210205, + "learning_rate": 0.0002, + "loss": 0.8961, + "step": 5130 + }, + { + "epoch": 6.999318336741649, + "eval_loss": 2.5376713275909424, + "eval_runtime": 53.6377, + "eval_samples_per_second": 9.452, + "eval_steps_per_second": 1.193, + "step": 5134 + }, + { + "epoch": 7.007498295841854, + "grad_norm": 1.7083442211151123, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 5140 + }, + { + "epoch": 7.021131561008862, + "grad_norm": 1.95943021774292, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 5150 + }, + { + "epoch": 7.034764826175869, + "grad_norm": 1.453168511390686, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 5160 + }, + { + "epoch": 7.048398091342877, + "grad_norm": 2.110145092010498, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 5170 + }, + { + "epoch": 7.062031356509884, + "grad_norm": 1.567636489868164, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 5180 + }, + { + "epoch": 7.075664621676892, + "grad_norm": 1.8596835136413574, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 5190 + }, + { + "epoch": 7.089297886843899, + "grad_norm": 1.7342605590820312, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 5200 + }, + { + "epoch": 7.102931152010907, + "grad_norm": 1.516591191291809, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 5210 + }, + { + "epoch": 7.116564417177914, + "grad_norm": 1.7696505784988403, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 5220 + }, + { + "epoch": 7.130197682344922, + "grad_norm": 2.1680636405944824, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 5230 + }, + { + "epoch": 7.143830947511929, + "grad_norm": 1.6825456619262695, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 5240 + }, + { + "epoch": 7.157464212678937, + "grad_norm": 2.036949634552002, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 5250 + }, + { + "epoch": 7.171097477845944, + "grad_norm": 1.8820315599441528, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 5260 + }, + { + "epoch": 7.184730743012952, + "grad_norm": 2.313140630722046, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 5270 + }, + { + "epoch": 7.198364008179959, + "grad_norm": 2.0305309295654297, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 5280 + }, + { + "epoch": 7.211997273346967, + "grad_norm": 1.707711100578308, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 5290 + }, + { + "epoch": 7.2256305385139745, + "grad_norm": 1.687009334564209, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 5300 + }, + { + "epoch": 7.2392638036809815, + "grad_norm": 2.0011701583862305, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 5310 + }, + { + "epoch": 7.2528970688479895, + "grad_norm": 1.9455368518829346, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 5320 + }, + { + "epoch": 7.2665303340149965, + "grad_norm": 1.5780237913131714, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 5330 + }, + { + "epoch": 7.280163599182004, + "grad_norm": 2.1882123947143555, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 5340 + }, + { + "epoch": 7.293796864349011, + "grad_norm": 2.089590549468994, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 5350 + }, + { + "epoch": 7.307430129516019, + "grad_norm": 1.8626707792282104, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 5360 + }, + { + "epoch": 7.321063394683026, + "grad_norm": 2.127977132797241, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 5370 + }, + { + "epoch": 7.334696659850034, + "grad_norm": 1.6568187475204468, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 5380 + }, + { + "epoch": 7.348329925017041, + "grad_norm": 1.5592522621154785, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 5390 + }, + { + "epoch": 7.361963190184049, + "grad_norm": 1.7897852659225464, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 5400 + }, + { + "epoch": 7.375596455351056, + "grad_norm": 2.071516275405884, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 5410 + }, + { + "epoch": 7.389229720518064, + "grad_norm": 2.048238515853882, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 5420 + }, + { + "epoch": 7.402862985685071, + "grad_norm": 1.770015001296997, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 5430 + }, + { + "epoch": 7.416496250852079, + "grad_norm": 1.7530136108398438, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 5440 + }, + { + "epoch": 7.430129516019086, + "grad_norm": 1.8113389015197754, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 5450 + }, + { + "epoch": 7.443762781186094, + "grad_norm": 1.8129119873046875, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 5460 + }, + { + "epoch": 7.457396046353102, + "grad_norm": 1.7961417436599731, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 5470 + }, + { + "epoch": 7.471029311520109, + "grad_norm": 1.8811243772506714, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 5480 + }, + { + "epoch": 7.484662576687117, + "grad_norm": 1.9619536399841309, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 5490 + }, + { + "epoch": 7.498295841854124, + "grad_norm": 1.9449920654296875, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 5500 + }, + { + "epoch": 7.511929107021132, + "grad_norm": 2.0600240230560303, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 5510 + }, + { + "epoch": 7.525562372188139, + "grad_norm": 1.9339587688446045, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 5520 + }, + { + "epoch": 7.539195637355147, + "grad_norm": 2.0672056674957275, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 5530 + }, + { + "epoch": 7.552828902522154, + "grad_norm": 1.8305774927139282, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 5540 + }, + { + "epoch": 7.566462167689162, + "grad_norm": 1.9546589851379395, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 5550 + }, + { + "epoch": 7.580095432856169, + "grad_norm": 1.657498836517334, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 5560 + }, + { + "epoch": 7.593728698023177, + "grad_norm": 2.0222396850585938, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 5570 + }, + { + "epoch": 7.6073619631901845, + "grad_norm": 1.9352941513061523, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 5580 + }, + { + "epoch": 7.620995228357192, + "grad_norm": 1.9743294715881348, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 5590 + }, + { + "epoch": 7.634628493524199, + "grad_norm": 1.949228048324585, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 5600 + }, + { + "epoch": 7.6482617586912065, + "grad_norm": 2.009384870529175, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 5610 + }, + { + "epoch": 7.661895023858214, + "grad_norm": 1.9622714519500732, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 5620 + }, + { + "epoch": 7.675528289025221, + "grad_norm": 2.142486810684204, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 5630 + }, + { + "epoch": 7.689161554192229, + "grad_norm": 2.4306538105010986, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 5640 + }, + { + "epoch": 7.702794819359236, + "grad_norm": 1.8343422412872314, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 5650 + }, + { + "epoch": 7.716428084526244, + "grad_norm": 2.1571617126464844, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 5660 + }, + { + "epoch": 7.730061349693251, + "grad_norm": 2.028083086013794, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 5670 + }, + { + "epoch": 7.743694614860259, + "grad_norm": 2.0310823917388916, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 5680 + }, + { + "epoch": 7.757327880027266, + "grad_norm": 1.9675135612487793, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 5690 + }, + { + "epoch": 7.770961145194274, + "grad_norm": 2.082470417022705, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 5700 + }, + { + "epoch": 7.784594410361281, + "grad_norm": 1.8454886674880981, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 5710 + }, + { + "epoch": 7.798227675528289, + "grad_norm": 2.0777692794799805, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 5720 + }, + { + "epoch": 7.811860940695297, + "grad_norm": 1.751173496246338, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 5730 + }, + { + "epoch": 7.825494205862304, + "grad_norm": 1.7728252410888672, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 5740 + }, + { + "epoch": 7.839127471029311, + "grad_norm": 1.9239917993545532, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 5750 + }, + { + "epoch": 7.852760736196319, + "grad_norm": 2.0526111125946045, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 5760 + }, + { + "epoch": 7.866394001363327, + "grad_norm": 2.097938060760498, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 5770 + }, + { + "epoch": 7.880027266530334, + "grad_norm": 1.8992373943328857, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 5780 + }, + { + "epoch": 7.893660531697342, + "grad_norm": 1.812042474746704, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 5790 + }, + { + "epoch": 7.907293796864349, + "grad_norm": 1.9535222053527832, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 5800 + }, + { + "epoch": 7.920927062031357, + "grad_norm": 2.0650830268859863, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 5810 + }, + { + "epoch": 7.934560327198364, + "grad_norm": 1.818130612373352, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 5820 + }, + { + "epoch": 7.948193592365372, + "grad_norm": 1.9505265951156616, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 5830 + }, + { + "epoch": 7.961826857532379, + "grad_norm": 2.072112798690796, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 5840 + }, + { + "epoch": 7.975460122699387, + "grad_norm": 1.6640431880950928, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 5850 + }, + { + "epoch": 7.989093387866394, + "grad_norm": 1.7920113801956177, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 5860 + }, + { + "epoch": 7.994546693933197, + "eval_loss": 2.979367971420288, + "eval_runtime": 53.6165, + "eval_samples_per_second": 9.456, + "eval_steps_per_second": 1.194, + "step": 5864 + } + ], + "logging_steps": 10, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.0119700436353024e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5864/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e98db163734cc03f7a8f8b3f720d3a2befdf7453 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..25a6157dd64c1895e96eba28c31bfe1297c5fb71 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f81567cf32cadbaf0117b7ad6c654a6b5fedc1775d91f1bdb0c88027d5c8d77d +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0023ef17d16e73687bc471a0c75ff8c35920f5fa --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b56b40b5e5c72dc13141f66816e8b3ddecc7cac5aa6d0b9d49b94287429d35 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d745c8a6d6d00d2d456e7c8d01e7c8ce575151e5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d13d378f48a3b99f4cf51bd02b734679f943a59db3e7293a9045d716b1a0cc3 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..480830aefb6dd526d5a03844c2ac28aaf126b1ca --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bd2e07a4b2fdc681e2ebaa6e83774d2b06a4ddc96f491816d22089322567c0c +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..57f085b44006633b9981cd4bc049ccf265b3a38f --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/trainer_state.json @@ -0,0 +1,552 @@ +{ + "best_metric": 1.8171186447143555, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", + "epoch": 0.9993183367416496, + "eval_steps": 10, + "global_step": 733, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013633265167007498, + "grad_norm": 0.7714291214942932, + "learning_rate": 0.0002, + "loss": 3.0982, + "step": 10 + }, + { + "epoch": 0.027266530334014997, + "grad_norm": 0.5473978519439697, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 20 + }, + { + "epoch": 0.0408997955010225, + "grad_norm": 0.5452795624732971, + "learning_rate": 0.0002, + "loss": 2.3079, + "step": 30 + }, + { + "epoch": 0.054533060668029994, + "grad_norm": 0.5098028779029846, + "learning_rate": 0.0002, + "loss": 2.0019, + "step": 40 + }, + { + "epoch": 0.0681663258350375, + "grad_norm": 0.48062971234321594, + "learning_rate": 0.0002, + "loss": 1.9333, + "step": 50 + }, + { + "epoch": 0.081799591002045, + "grad_norm": 0.4505695104598999, + "learning_rate": 0.0002, + "loss": 1.9355, + "step": 60 + }, + { + "epoch": 0.09543285616905249, + "grad_norm": 0.41609591245651245, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 70 + }, + { + "epoch": 0.10906612133605999, + "grad_norm": 0.4323892593383789, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 80 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 0.4670293629169464, + "learning_rate": 0.0002, + "loss": 1.9294, + "step": 90 + }, + { + "epoch": 0.136332651670075, + "grad_norm": 0.40623316168785095, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 100 + }, + { + "epoch": 0.1499659168370825, + "grad_norm": 0.3620383143424988, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 110 + }, + { + "epoch": 0.16359918200409, + "grad_norm": 0.332218736410141, + "learning_rate": 0.0002, + "loss": 1.9238, + "step": 120 + }, + { + "epoch": 0.17723244717109748, + "grad_norm": 0.4004521667957306, + "learning_rate": 0.0002, + "loss": 1.93, + "step": 130 + }, + { + "epoch": 0.19086571233810498, + "grad_norm": 0.3698360323905945, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 140 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.3847949504852295, + "learning_rate": 0.0002, + "loss": 1.8771, + "step": 150 + }, + { + "epoch": 0.21813224267211997, + "grad_norm": 0.36843451857566833, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 160 + }, + { + "epoch": 0.23176550783912747, + "grad_norm": 0.37301021814346313, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 170 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 0.3718886971473694, + "learning_rate": 0.0002, + "loss": 1.8909, + "step": 180 + }, + { + "epoch": 0.25903203817314246, + "grad_norm": 0.3088490962982178, + "learning_rate": 0.0002, + "loss": 1.8454, + "step": 190 + }, + { + "epoch": 0.27266530334015, + "grad_norm": 0.3611852526664734, + "learning_rate": 0.0002, + "loss": 1.9254, + "step": 200 + }, + { + "epoch": 0.28629856850715746, + "grad_norm": 0.36093324422836304, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 210 + }, + { + "epoch": 0.299931833674165, + "grad_norm": 0.3250400722026825, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 220 + }, + { + "epoch": 0.31356509884117245, + "grad_norm": 0.3566756248474121, + "learning_rate": 0.0002, + "loss": 1.8729, + "step": 230 + }, + { + "epoch": 0.32719836400818, + "grad_norm": 0.32872408628463745, + "learning_rate": 0.0002, + "loss": 1.9259, + "step": 240 + }, + { + "epoch": 0.34083162917518744, + "grad_norm": 0.3983881175518036, + "learning_rate": 0.0002, + "loss": 1.9033, + "step": 250 + }, + { + "epoch": 0.35446489434219497, + "grad_norm": 0.3571510910987854, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 260 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.3036131262779236, + "learning_rate": 0.0002, + "loss": 1.8539, + "step": 270 + }, + { + "epoch": 0.38173142467620996, + "grad_norm": 0.36512863636016846, + "learning_rate": 0.0002, + "loss": 1.8572, + "step": 280 + }, + { + "epoch": 0.39536468984321743, + "grad_norm": 0.3429736793041229, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 290 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.3055964708328247, + "learning_rate": 0.0002, + "loss": 1.8754, + "step": 300 + }, + { + "epoch": 0.4226312201772324, + "grad_norm": 0.33801034092903137, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 310 + }, + { + "epoch": 0.43626448534423995, + "grad_norm": 0.348783016204834, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 320 + }, + { + "epoch": 0.4498977505112474, + "grad_norm": 0.3057514727115631, + "learning_rate": 0.0002, + "loss": 1.8451, + "step": 330 + }, + { + "epoch": 0.46353101567825494, + "grad_norm": 0.3849763572216034, + "learning_rate": 0.0002, + "loss": 1.8766, + "step": 340 + }, + { + "epoch": 0.47716428084526247, + "grad_norm": 0.30080053210258484, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 350 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.3595106303691864, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 360 + }, + { + "epoch": 0.5044308111792775, + "grad_norm": 0.31099820137023926, + "learning_rate": 0.0002, + "loss": 1.8232, + "step": 370 + }, + { + "epoch": 0.5180640763462849, + "grad_norm": 0.3157978355884552, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 380 + }, + { + "epoch": 0.5316973415132924, + "grad_norm": 0.27960965037345886, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 390 + }, + { + "epoch": 0.5453306066803, + "grad_norm": 0.3102385103702545, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 400 + }, + { + "epoch": 0.5589638718473074, + "grad_norm": 0.32828861474990845, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 410 + }, + { + "epoch": 0.5725971370143149, + "grad_norm": 0.29560017585754395, + "learning_rate": 0.0002, + "loss": 1.8165, + "step": 420 + }, + { + "epoch": 0.5862304021813224, + "grad_norm": 0.33316895365715027, + "learning_rate": 0.0002, + "loss": 1.9455, + "step": 430 + }, + { + "epoch": 0.59986366734833, + "grad_norm": 0.30420982837677, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.32619214057922363, + "learning_rate": 0.0002, + "loss": 1.7565, + "step": 450 + }, + { + "epoch": 0.6271301976823449, + "grad_norm": 0.3603750765323639, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 460 + }, + { + "epoch": 0.6407634628493524, + "grad_norm": 0.30834096670150757, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 470 + }, + { + "epoch": 0.65439672801636, + "grad_norm": 0.28756365180015564, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 480 + }, + { + "epoch": 0.6680299931833674, + "grad_norm": 0.2878406345844269, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 490 + }, + { + "epoch": 0.6816632583503749, + "grad_norm": 0.31329697370529175, + "learning_rate": 0.0002, + "loss": 1.8581, + "step": 500 + }, + { + "epoch": 0.6952965235173824, + "grad_norm": 0.3405822515487671, + "learning_rate": 0.0002, + "loss": 1.7886, + "step": 510 + }, + { + "epoch": 0.7089297886843899, + "grad_norm": 0.305560827255249, + "learning_rate": 0.0002, + "loss": 1.778, + "step": 520 + }, + { + "epoch": 0.7225630538513974, + "grad_norm": 0.2973416745662689, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 530 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.327303946018219, + "learning_rate": 0.0002, + "loss": 1.8223, + "step": 540 + }, + { + "epoch": 0.7498295841854125, + "grad_norm": 0.62595534324646, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 550 + }, + { + "epoch": 0.7634628493524199, + "grad_norm": 0.3129784166812897, + "learning_rate": 0.0002, + "loss": 1.7466, + "step": 560 + }, + { + "epoch": 0.7770961145194274, + "grad_norm": 0.32496583461761475, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 570 + }, + { + "epoch": 0.7907293796864349, + "grad_norm": 0.3098868131637573, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 580 + }, + { + "epoch": 0.8043626448534424, + "grad_norm": 0.30726853013038635, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 590 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.2964220643043518, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 600 + }, + { + "epoch": 0.8316291751874574, + "grad_norm": 0.32352274656295776, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 610 + }, + { + "epoch": 0.8452624403544649, + "grad_norm": 0.2938912510871887, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 620 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.295559823513031, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 630 + }, + { + "epoch": 0.8725289706884799, + "grad_norm": 0.34102028608322144, + "learning_rate": 0.0002, + "loss": 1.9, + "step": 640 + }, + { + "epoch": 0.8861622358554874, + "grad_norm": 0.29676181077957153, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.8997955010224948, + "grad_norm": 0.3108902871608734, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 660 + }, + { + "epoch": 0.9134287661895024, + "grad_norm": 0.2690821588039398, + "learning_rate": 0.0002, + "loss": 1.7955, + "step": 670 + }, + { + "epoch": 0.9270620313565099, + "grad_norm": 0.32752540707588196, + "learning_rate": 0.0002, + "loss": 1.7881, + "step": 680 + }, + { + "epoch": 0.9406952965235174, + "grad_norm": 0.8029476404190063, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 690 + }, + { + "epoch": 0.9543285616905249, + "grad_norm": 0.30534422397613525, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 700 + }, + { + "epoch": 0.9679618268575324, + "grad_norm": 0.2899954319000244, + "learning_rate": 0.0002, + "loss": 1.7614, + "step": 710 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.28814372420310974, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 720 + }, + { + "epoch": 0.9952283571915473, + "grad_norm": 0.3061596751213074, + "learning_rate": 0.0002, + "loss": 1.8865, + "step": 730 + }, + { + "epoch": 0.9993183367416496, + "eval_loss": 1.8171186447143555, + "eval_runtime": 53.6047, + "eval_samples_per_second": 9.458, + "eval_steps_per_second": 1.194, + "step": 733 + } + ], + "logging_steps": 10, + "max_steps": 5864, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.767530741825536e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..da7142eb13ed7f8418e5055c63a0fe0ca5e1972b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8beac9fdfb91726fdf7473c9e77541aa988c61dc8beaba03293eafbe9c0a376 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_log.jsonl b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c955501d718f7a502d8a7a54a70cdf1bf28999c --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_log.jsonl @@ -0,0 +1,10 @@ +{"epoch": 0.9993183367416496, "step": 733, "epoch_duration": 5468.251661539078, "total_accumulated_duration": 5468.251661539078, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.1008, "grad_norm": 0.7675835490226746, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5292, "grad_norm": 0.5283138751983643, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3126, "grad_norm": 0.5684153437614441, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 1.9885, "grad_norm": 0.4469817578792572, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.922, "grad_norm": 0.4745156168937683, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9318, "grad_norm": 0.46541205048561096, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9332, "grad_norm": 0.42755699157714844, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8678, "grad_norm": 0.46226418018341064, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9255, "grad_norm": 0.4570668637752533, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7914, "grad_norm": 0.4365231394767761, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8549, "grad_norm": 0.38550880551338196, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9213, "grad_norm": 0.3430306017398834, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.9301, "grad_norm": 0.4018137454986572, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7552, "grad_norm": 0.3614208996295929, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8748, "grad_norm": 0.35473498702049255, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8337, "grad_norm": 0.35780423879623413, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.8387, "grad_norm": 0.3678103983402252, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8921, "grad_norm": 0.3819291293621063, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8451, "grad_norm": 0.3084178864955902, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9236, "grad_norm": 0.3720385432243347, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.783, "grad_norm": 0.3562837541103363, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.7199, "grad_norm": 0.32655608654022217, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8723, "grad_norm": 0.36233681440353394, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9256, "grad_norm": 0.32759809494018555, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9013, "grad_norm": 0.4187735915184021, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8576, "grad_norm": 0.37079235911369324, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8521, "grad_norm": 0.383121520280838, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8569, "grad_norm": 0.3556862771511078, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8037, "grad_norm": 0.3265869915485382, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.874, "grad_norm": 0.30522024631500244, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8346, "grad_norm": 0.33467918634414673, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7928, "grad_norm": 0.498266339302063, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3135899305343628, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8759, "grad_norm": 0.39500781893730164, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8084, "grad_norm": 0.3009391129016876, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8536, "grad_norm": 0.36251890659332275, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8244, "grad_norm": 0.31266191601753235, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7026, "grad_norm": 0.3183344900608063, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8262, "grad_norm": 0.28234341740608215, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7407, "grad_norm": 0.3185858726501465, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7446, "grad_norm": 0.4044371247291565, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8182, "grad_norm": 0.29381003975868225, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9416, "grad_norm": 0.3450601398944855, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.82, "grad_norm": 0.3334464728832245, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7556, "grad_norm": 0.3518504500389099, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7957, "grad_norm": 0.3641590178012848, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7757, "grad_norm": 0.36691612005233765, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.805, "grad_norm": 0.28644075989723206, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.7429, "grad_norm": 0.29623064398765564, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8579, "grad_norm": 0.30514174699783325, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7897, "grad_norm": 0.33950939774513245, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.7791, "grad_norm": 0.3081190288066864, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.76, "grad_norm": 0.29419979453086853, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8211, "grad_norm": 0.3383752107620239, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8593, "grad_norm": 0.2980792820453644, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3419261872768402, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8007, "grad_norm": 0.31977590918540955, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7766, "grad_norm": 0.3168697655200958, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7201, "grad_norm": 0.31143882870674133, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7912, "grad_norm": 0.29171547293663025, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8147, "grad_norm": 0.3080858886241913, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.81, "grad_norm": 0.2734144926071167, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7716, "grad_norm": 0.2898387014865875, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9015, "grad_norm": 0.34029147028923035, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8012, "grad_norm": 0.28186970949172974, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8097, "grad_norm": 0.3147708773612976, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7954, "grad_norm": 0.26419591903686523, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7877, "grad_norm": 0.3258209824562073, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7631, "grad_norm": 0.272344708442688, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7767, "grad_norm": 0.3036558926105499, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.76, "grad_norm": 0.2845146059989929, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7801, "grad_norm": 0.28313693404197693, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8849, "grad_norm": 0.3092978596687317, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}]} +{"epoch": 0.9993183367416496, "step": 733, "epoch_duration": 2911.6444346904755, "total_accumulated_duration": 2911.6444346904755, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0948, "grad_norm": 0.7837588787078857, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5115, "grad_norm": 0.5483142733573914, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3159, "grad_norm": 0.5654940605163574, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 1.9985, "grad_norm": 0.47807976603507996, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9277, "grad_norm": 0.5375542044639587, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9403, "grad_norm": 0.4766482412815094, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9332, "grad_norm": 0.39935776591300964, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.866, "grad_norm": 0.45399025082588196, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9304, "grad_norm": 0.46186086535453796, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.795, "grad_norm": 0.43516600131988525, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8606, "grad_norm": 0.3928009569644928, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9286, "grad_norm": 0.3584233522415161, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.9304, "grad_norm": 0.3833671808242798, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7561, "grad_norm": 0.36329957842826843, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8758, "grad_norm": 0.35854387283325195, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8374, "grad_norm": 0.36146774888038635, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.8389, "grad_norm": 0.3533250093460083, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8929, "grad_norm": 0.36693814396858215, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.847, "grad_norm": 0.3156248927116394, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.925, "grad_norm": 0.34866863489151, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7853, "grad_norm": 0.36728984117507935, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3290116786956787, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8711, "grad_norm": 0.4102158844470978, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9289, "grad_norm": 0.3361101746559143, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9025, "grad_norm": 0.4293116629123688, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.86, "grad_norm": 0.36179205775260925, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8537, "grad_norm": 0.3195672929286957, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8587, "grad_norm": 0.373018354177475, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8016, "grad_norm": 0.33340632915496826, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8732, "grad_norm": 0.29840829968452454, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8369, "grad_norm": 0.35418516397476196, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7929, "grad_norm": 0.38273194432258606, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8441, "grad_norm": 0.30469033122062683, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8736, "grad_norm": 0.3807629644870758, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8064, "grad_norm": 0.2952139377593994, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8532, "grad_norm": 0.358732134103775, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8269, "grad_norm": 0.3062337636947632, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.704, "grad_norm": 0.3123418986797333, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8267, "grad_norm": 0.2843833267688751, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7403, "grad_norm": 0.36187559366226196, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7429, "grad_norm": 0.3211730420589447, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8191, "grad_norm": 0.31190505623817444, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.946, "grad_norm": 0.44237005710601807, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8204, "grad_norm": 0.30353036522865295, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7575, "grad_norm": 0.3608725666999817, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.795, "grad_norm": 0.30137816071510315, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7759, "grad_norm": 0.31448695063591003, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8057, "grad_norm": 0.2958511412143707, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.7433, "grad_norm": 0.28700390458106995, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8576, "grad_norm": 0.31016767024993896, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7904, "grad_norm": 0.33729124069213867, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.7786, "grad_norm": 0.32536232471466064, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7588, "grad_norm": 0.30095645785331726, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8186, "grad_norm": 0.3496246635913849, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8587, "grad_norm": 0.30101504921913147, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7467, "grad_norm": 0.3162820637226105, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.7999, "grad_norm": 0.31092569231987, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7794, "grad_norm": 0.31278666853904724, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7213, "grad_norm": 0.30959561467170715, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7884, "grad_norm": 0.3000122606754303, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8163, "grad_norm": 0.31050294637680054, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.8064, "grad_norm": 0.27638447284698486, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7691, "grad_norm": 0.2875880300998688, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9008, "grad_norm": 0.42822250723838806, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.7996, "grad_norm": 0.28577205538749695, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8084, "grad_norm": 0.30910760164260864, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7981, "grad_norm": 0.2699502408504486, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7894, "grad_norm": 0.4193672835826874, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7638, "grad_norm": 0.29555195569992065, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7787, "grad_norm": 0.29588839411735535, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7607, "grad_norm": 0.28271564841270447, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7813, "grad_norm": 0.2848884165287018, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8849, "grad_norm": 0.3218976557254791, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}]} +{"epoch": 0.9993183367416496, "step": 733, "epoch_duration": 1097.0387206077576, "total_accumulated_duration": 1097.0387206077576, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0982, "grad_norm": 0.7714291214942932, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5206, "grad_norm": 0.5473978519439697, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3079, "grad_norm": 0.5452795624732971, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 2.0019, "grad_norm": 0.5098028779029846, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9333, "grad_norm": 0.48062971234321594, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9355, "grad_norm": 0.4505695104598999, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9312, "grad_norm": 0.41609591245651245, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8656, "grad_norm": 0.4323892593383789, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9294, "grad_norm": 0.4670293629169464, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7946, "grad_norm": 0.40623316168785095, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8565, "grad_norm": 0.3620383143424988, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9238, "grad_norm": 0.332218736410141, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.93, "grad_norm": 0.4004521667957306, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7549, "grad_norm": 0.3698360323905945, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8771, "grad_norm": 0.3847949504852295, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8316, "grad_norm": 0.36843451857566833, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.838, "grad_norm": 0.37301021814346313, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8909, "grad_norm": 0.3718886971473694, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8454, "grad_norm": 0.3088490962982178, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9254, "grad_norm": 0.3611852526664734, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7844, "grad_norm": 0.36093324422836304, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3250400722026825, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8729, "grad_norm": 0.3566756248474121, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9259, "grad_norm": 0.32872408628463745, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9033, "grad_norm": 0.3983881175518036, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8588, "grad_norm": 0.3571510910987854, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8539, "grad_norm": 0.3036131262779236, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8572, "grad_norm": 0.36512863636016846, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8022, "grad_norm": 0.3429736793041229, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8754, "grad_norm": 0.3055964708328247, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8384, "grad_norm": 0.33801034092903137, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7933, "grad_norm": 0.348783016204834, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3057514727115631, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8766, "grad_norm": 0.3849763572216034, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8073, "grad_norm": 0.30080053210258484, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8548, "grad_norm": 0.3595106303691864, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8232, "grad_norm": 0.31099820137023926, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7029, "grad_norm": 0.3157978355884552, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8265, "grad_norm": 0.27960965037345886, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7414, "grad_norm": 0.3102385103702545, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7461, "grad_norm": 0.32828861474990845, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8165, "grad_norm": 0.29560017585754395, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9455, "grad_norm": 0.33316895365715027, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8241, "grad_norm": 0.30420982837677, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7565, "grad_norm": 0.32619214057922363, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7945, "grad_norm": 0.3603750765323639, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7773, "grad_norm": 0.30834096670150757, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8058, "grad_norm": 0.28756365180015564, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.744, "grad_norm": 0.2878406345844269, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8581, "grad_norm": 0.31329697370529175, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7886, "grad_norm": 0.3405822515487671, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.778, "grad_norm": 0.305560827255249, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7592, "grad_norm": 0.2973416745662689, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8223, "grad_norm": 0.327303946018219, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8591, "grad_norm": 0.62595534324646, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3129784166812897, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8035, "grad_norm": 0.32496583461761475, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7787, "grad_norm": 0.3098868131637573, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7196, "grad_norm": 0.30726853013038635, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7898, "grad_norm": 0.2964220643043518, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8114, "grad_norm": 0.32352274656295776, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.811, "grad_norm": 0.2938912510871887, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7727, "grad_norm": 0.295559823513031, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9, "grad_norm": 0.34102028608322144, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8006, "grad_norm": 0.29676181077957153, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8099, "grad_norm": 0.3108902871608734, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7955, "grad_norm": 0.2690821588039398, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7881, "grad_norm": 0.32752540707588196, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7661, "grad_norm": 0.8029476404190063, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7733, "grad_norm": 0.30534422397613525, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7614, "grad_norm": 0.2899954319000244, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7845, "grad_norm": 0.28814372420310974, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8865, "grad_norm": 0.3061596751213074, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}]} +{"epoch": 2.0, "step": 1467, "epoch_duration": 1095.1415996551514, "total_accumulated_duration": 2192.180320262909, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0982, "grad_norm": 0.7714291214942932, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5206, "grad_norm": 0.5473978519439697, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3079, "grad_norm": 0.5452795624732971, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 2.0019, "grad_norm": 0.5098028779029846, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9333, "grad_norm": 0.48062971234321594, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9355, "grad_norm": 0.4505695104598999, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9312, "grad_norm": 0.41609591245651245, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8656, "grad_norm": 0.4323892593383789, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9294, "grad_norm": 0.4670293629169464, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7946, "grad_norm": 0.40623316168785095, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8565, "grad_norm": 0.3620383143424988, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9238, "grad_norm": 0.332218736410141, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.93, "grad_norm": 0.4004521667957306, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7549, "grad_norm": 0.3698360323905945, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8771, "grad_norm": 0.3847949504852295, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8316, "grad_norm": 0.36843451857566833, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.838, "grad_norm": 0.37301021814346313, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8909, "grad_norm": 0.3718886971473694, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8454, "grad_norm": 0.3088490962982178, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9254, "grad_norm": 0.3611852526664734, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7844, "grad_norm": 0.36093324422836304, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3250400722026825, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8729, "grad_norm": 0.3566756248474121, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9259, "grad_norm": 0.32872408628463745, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9033, "grad_norm": 0.3983881175518036, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8588, "grad_norm": 0.3571510910987854, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8539, "grad_norm": 0.3036131262779236, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8572, "grad_norm": 0.36512863636016846, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8022, "grad_norm": 0.3429736793041229, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8754, "grad_norm": 0.3055964708328247, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8384, "grad_norm": 0.33801034092903137, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7933, "grad_norm": 0.348783016204834, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3057514727115631, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8766, "grad_norm": 0.3849763572216034, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8073, "grad_norm": 0.30080053210258484, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8548, "grad_norm": 0.3595106303691864, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8232, "grad_norm": 0.31099820137023926, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7029, "grad_norm": 0.3157978355884552, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8265, "grad_norm": 0.27960965037345886, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7414, "grad_norm": 0.3102385103702545, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7461, "grad_norm": 0.32828861474990845, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8165, "grad_norm": 0.29560017585754395, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9455, "grad_norm": 0.33316895365715027, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8241, "grad_norm": 0.30420982837677, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7565, "grad_norm": 0.32619214057922363, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7945, "grad_norm": 0.3603750765323639, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7773, "grad_norm": 0.30834096670150757, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8058, "grad_norm": 0.28756365180015564, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.744, "grad_norm": 0.2878406345844269, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8581, "grad_norm": 0.31329697370529175, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7886, "grad_norm": 0.3405822515487671, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.778, "grad_norm": 0.305560827255249, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7592, "grad_norm": 0.2973416745662689, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8223, "grad_norm": 0.327303946018219, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8591, "grad_norm": 0.62595534324646, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3129784166812897, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8035, "grad_norm": 0.32496583461761475, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7787, "grad_norm": 0.3098868131637573, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7196, "grad_norm": 0.30726853013038635, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7898, "grad_norm": 0.2964220643043518, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8114, "grad_norm": 0.32352274656295776, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.811, "grad_norm": 0.2938912510871887, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7727, "grad_norm": 0.295559823513031, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9, "grad_norm": 0.34102028608322144, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8006, "grad_norm": 0.29676181077957153, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8099, "grad_norm": 0.3108902871608734, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7955, "grad_norm": 0.2690821588039398, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7881, "grad_norm": 0.32752540707588196, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7661, "grad_norm": 0.8029476404190063, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7733, "grad_norm": 0.30534422397613525, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7614, "grad_norm": 0.2899954319000244, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7845, "grad_norm": 0.28814372420310974, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8865, "grad_norm": 0.3061596751213074, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}, {"eval_loss": 1.8171186447143555, "eval_runtime": 53.6047, "eval_samples_per_second": 9.458, "eval_steps_per_second": 1.194, "epoch": 0.9993183367416496, "step": 733}, {"loss": 1.6202, "grad_norm": 0.3140897750854492, "learning_rate": 0.0002, "epoch": 1.008861622358555, "step": 740}, {"loss": 1.8409, "grad_norm": 0.3346109390258789, "learning_rate": 0.0002, "epoch": 1.0224948875255624, "step": 750}, {"loss": 1.6777, "grad_norm": 0.3582976758480072, "learning_rate": 0.0002, "epoch": 1.0361281526925699, "step": 760}, {"loss": 1.7306, "grad_norm": 0.30408260226249695, "learning_rate": 0.0002, "epoch": 1.0497614178595773, "step": 770}, {"loss": 1.6967, "grad_norm": 0.323585569858551, "learning_rate": 0.0002, "epoch": 1.0633946830265848, "step": 780}, {"loss": 1.768, "grad_norm": 0.3474137783050537, "learning_rate": 0.0002, "epoch": 1.0770279481935923, "step": 790}, {"loss": 1.6895, "grad_norm": 0.35721147060394287, "learning_rate": 0.0002, "epoch": 1.0906612133606, "step": 800}, {"loss": 1.718, "grad_norm": 0.35366931557655334, "learning_rate": 0.0002, "epoch": 1.1042944785276074, "step": 810}, {"loss": 1.6797, "grad_norm": 0.3250770568847656, "learning_rate": 0.0002, "epoch": 1.117927743694615, "step": 820}, {"loss": 1.6383, "grad_norm": 0.3293766379356384, "learning_rate": 0.0002, "epoch": 1.1315610088616224, "step": 830}, {"loss": 1.7353, "grad_norm": 0.3380851745605469, "learning_rate": 0.0002, "epoch": 1.1451942740286298, "step": 840}, {"loss": 1.8236, "grad_norm": 0.32584455609321594, "learning_rate": 0.0002, "epoch": 1.1588275391956373, "step": 850}, {"loss": 1.6681, "grad_norm": 0.45700767636299133, "learning_rate": 0.0002, "epoch": 1.1724608043626448, "step": 860}, {"loss": 1.7494, "grad_norm": 0.30944544076919556, "learning_rate": 0.0002, "epoch": 1.1860940695296525, "step": 870}, {"loss": 1.7426, "grad_norm": 0.3268151581287384, "learning_rate": 0.0002, "epoch": 1.19972733469666, "step": 880}, {"loss": 1.7413, "grad_norm": 0.39972540736198425, "learning_rate": 0.0002, "epoch": 1.2133605998636674, "step": 890}, {"loss": 1.7481, "grad_norm": 0.7890929579734802, "learning_rate": 0.0002, "epoch": 1.2269938650306749, "step": 900}, {"loss": 1.7608, "grad_norm": 0.3439182639122009, "learning_rate": 0.0002, "epoch": 1.2406271301976823, "step": 910}, {"loss": 1.7617, "grad_norm": 0.3986225128173828, "learning_rate": 0.0002, "epoch": 1.2542603953646898, "step": 920}, {"loss": 1.6843, "grad_norm": 0.3514605164527893, "learning_rate": 0.0002, "epoch": 1.2678936605316973, "step": 930}, {"loss": 1.6987, "grad_norm": 0.3682589530944824, "learning_rate": 0.0002, "epoch": 1.2815269256987047, "step": 940}, {"loss": 1.6988, "grad_norm": 0.3618335723876953, "learning_rate": 0.0002, "epoch": 1.2951601908657122, "step": 950}, {"loss": 1.7436, "grad_norm": 0.345700740814209, "learning_rate": 0.0002, "epoch": 1.30879345603272, "step": 960}, {"loss": 1.7336, "grad_norm": 0.3514927923679352, "learning_rate": 0.0002, "epoch": 1.3224267211997274, "step": 970}, {"loss": 1.7704, "grad_norm": 0.365647554397583, "learning_rate": 0.0002, "epoch": 1.3360599863667348, "step": 980}, {"loss": 1.7104, "grad_norm": 0.3407285809516907, "learning_rate": 0.0002, "epoch": 1.3496932515337423, "step": 990}, {"loss": 1.7132, "grad_norm": 0.3785437345504761, "learning_rate": 0.0002, "epoch": 1.3633265167007498, "step": 1000}, {"loss": 1.766, "grad_norm": 0.34746724367141724, "learning_rate": 0.0002, "epoch": 1.3769597818677572, "step": 1010}, {"loss": 1.7252, "grad_norm": 0.362444132566452, "learning_rate": 0.0002, "epoch": 1.390593047034765, "step": 1020}, {"loss": 1.7132, "grad_norm": 0.4424704611301422, "learning_rate": 0.0002, "epoch": 1.4042263122017724, "step": 1030}, {"loss": 1.726, "grad_norm": 0.38722458481788635, "learning_rate": 0.0002, "epoch": 1.4178595773687799, "step": 1040}, {"loss": 1.7955, "grad_norm": 0.36089080572128296, "learning_rate": 0.0002, "epoch": 1.4314928425357873, "step": 1050}, {"loss": 1.6924, "grad_norm": 0.33817124366760254, "learning_rate": 0.0002, "epoch": 1.4451261077027948, "step": 1060}, {"loss": 1.7165, "grad_norm": 0.34334081411361694, "learning_rate": 0.0002, "epoch": 1.4587593728698023, "step": 1070}, {"loss": 1.6999, "grad_norm": 0.3776826858520508, "learning_rate": 0.0002, "epoch": 1.4723926380368098, "step": 1080}, {"loss": 1.7605, "grad_norm": 0.4169026017189026, "learning_rate": 0.0002, "epoch": 1.4860259032038172, "step": 1090}, {"loss": 1.7502, "grad_norm": 0.34898945689201355, "learning_rate": 0.0002, "epoch": 1.4996591683708247, "step": 1100}, {"loss": 1.635, "grad_norm": 0.34223780035972595, "learning_rate": 0.0002, "epoch": 1.5132924335378322, "step": 1110}, {"loss": 1.7248, "grad_norm": 0.3686901032924652, "learning_rate": 0.0002, "epoch": 1.5269256987048399, "step": 1120}, {"loss": 1.7525, "grad_norm": 0.35054415464401245, "learning_rate": 0.0002, "epoch": 1.5405589638718473, "step": 1130}, {"loss": 1.7776, "grad_norm": 0.39496365189552307, "learning_rate": 0.0002, "epoch": 1.5541922290388548, "step": 1140}, {"loss": 1.6574, "grad_norm": 0.35451626777648926, "learning_rate": 0.0002, "epoch": 1.5678254942058623, "step": 1150}, {"loss": 1.7257, "grad_norm": 0.3848083019256592, "learning_rate": 0.0002, "epoch": 1.58145875937287, "step": 1160}, {"loss": 1.7272, "grad_norm": 0.3760537803173065, "learning_rate": 0.0002, "epoch": 1.5950920245398774, "step": 1170}, {"loss": 1.7441, "grad_norm": 0.38981738686561584, "learning_rate": 0.0002, "epoch": 1.6087252897068849, "step": 1180}, {"loss": 1.6951, "grad_norm": 0.36830949783325195, "learning_rate": 0.0002, "epoch": 1.6223585548738924, "step": 1190}, {"loss": 1.6925, "grad_norm": 0.3405892848968506, "learning_rate": 0.0002, "epoch": 1.6359918200408998, "step": 1200}, {"loss": 1.7473, "grad_norm": 0.39027872681617737, "learning_rate": 0.0002, "epoch": 1.6496250852079073, "step": 1210}, {"loss": 1.6792, "grad_norm": 0.3342694044113159, "learning_rate": 0.0002, "epoch": 1.6632583503749148, "step": 1220}, {"loss": 1.7196, "grad_norm": 0.3600076735019684, "learning_rate": 0.0002, "epoch": 1.6768916155419222, "step": 1230}, {"loss": 1.7021, "grad_norm": 0.3625542223453522, "learning_rate": 0.0002, "epoch": 1.6905248807089297, "step": 1240}, {"loss": 1.6772, "grad_norm": 0.32170894742012024, "learning_rate": 0.0002, "epoch": 1.7041581458759372, "step": 1250}, {"loss": 1.7152, "grad_norm": 0.3544139862060547, "learning_rate": 0.0002, "epoch": 1.7177914110429446, "step": 1260}, {"loss": 1.7138, "grad_norm": 0.35113027691841125, "learning_rate": 0.0002, "epoch": 1.7314246762099523, "step": 1270}, {"loss": 1.7095, "grad_norm": 0.3499974310398102, "learning_rate": 0.0002, "epoch": 1.7450579413769598, "step": 1280}, {"loss": 1.7749, "grad_norm": 0.3285157382488251, "learning_rate": 0.0002, "epoch": 1.7586912065439673, "step": 1290}, {"loss": 1.6767, "grad_norm": 0.3701961636543274, "learning_rate": 0.0002, "epoch": 1.7723244717109747, "step": 1300}, {"loss": 1.6282, "grad_norm": 0.3301318287849426, "learning_rate": 0.0002, "epoch": 1.7859577368779824, "step": 1310}, {"loss": 1.7097, "grad_norm": 0.37801554799079895, "learning_rate": 0.0002, "epoch": 1.79959100204499, "step": 1320}, {"loss": 1.7437, "grad_norm": 0.3726748526096344, "learning_rate": 0.0002, "epoch": 1.8132242672119974, "step": 1330}, {"loss": 1.7959, "grad_norm": 0.4059790074825287, "learning_rate": 0.0002, "epoch": 1.8268575323790048, "step": 1340}, {"loss": 1.7739, "grad_norm": 0.35712096095085144, "learning_rate": 0.0002, "epoch": 1.8404907975460123, "step": 1350}, {"loss": 1.6375, "grad_norm": 0.35995328426361084, "learning_rate": 0.0002, "epoch": 1.8541240627130198, "step": 1360}, {"loss": 1.7332, "grad_norm": 0.3679947257041931, "learning_rate": 0.0002, "epoch": 1.8677573278800272, "step": 1370}, {"loss": 1.7587, "grad_norm": 0.39645957946777344, "learning_rate": 0.0002, "epoch": 1.8813905930470347, "step": 1380}, {"loss": 1.6985, "grad_norm": 0.35288700461387634, "learning_rate": 0.0002, "epoch": 1.8950238582140422, "step": 1390}, {"loss": 1.6582, "grad_norm": 0.32579198479652405, "learning_rate": 0.0002, "epoch": 1.9086571233810496, "step": 1400}, {"loss": 1.6948, "grad_norm": 0.3856561779975891, "learning_rate": 0.0002, "epoch": 1.9222903885480571, "step": 1410}, {"loss": 1.668, "grad_norm": 0.39019331336021423, "learning_rate": 0.0002, "epoch": 1.9359236537150648, "step": 1420}, {"loss": 1.7774, "grad_norm": 0.38006502389907837, "learning_rate": 0.0002, "epoch": 1.9495569188820723, "step": 1430}, {"loss": 1.8323, "grad_norm": 0.38100454211235046, "learning_rate": 0.0002, "epoch": 1.9631901840490797, "step": 1440}, {"loss": 1.7298, "grad_norm": 0.3405798673629761, "learning_rate": 0.0002, "epoch": 1.9768234492160872, "step": 1450}, {"loss": 1.7045, "grad_norm": 0.36582913994789124, "learning_rate": 0.0002, "epoch": 1.990456714383095, "step": 1460}]} +{"epoch": 2.9993183367416494, "step": 2200, "epoch_duration": 1094.913452386856, "total_accumulated_duration": 3287.093772649765, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0982, "grad_norm": 0.7714291214942932, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5206, "grad_norm": 0.5473978519439697, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3079, "grad_norm": 0.5452795624732971, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 2.0019, "grad_norm": 0.5098028779029846, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9333, "grad_norm": 0.48062971234321594, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9355, "grad_norm": 0.4505695104598999, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9312, "grad_norm": 0.41609591245651245, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8656, "grad_norm": 0.4323892593383789, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9294, "grad_norm": 0.4670293629169464, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7946, "grad_norm": 0.40623316168785095, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8565, "grad_norm": 0.3620383143424988, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9238, "grad_norm": 0.332218736410141, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.93, "grad_norm": 0.4004521667957306, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7549, "grad_norm": 0.3698360323905945, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8771, "grad_norm": 0.3847949504852295, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8316, "grad_norm": 0.36843451857566833, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.838, "grad_norm": 0.37301021814346313, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8909, "grad_norm": 0.3718886971473694, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8454, "grad_norm": 0.3088490962982178, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9254, "grad_norm": 0.3611852526664734, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7844, "grad_norm": 0.36093324422836304, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3250400722026825, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8729, "grad_norm": 0.3566756248474121, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9259, "grad_norm": 0.32872408628463745, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9033, "grad_norm": 0.3983881175518036, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8588, "grad_norm": 0.3571510910987854, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8539, "grad_norm": 0.3036131262779236, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8572, "grad_norm": 0.36512863636016846, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8022, "grad_norm": 0.3429736793041229, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8754, "grad_norm": 0.3055964708328247, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8384, "grad_norm": 0.33801034092903137, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7933, "grad_norm": 0.348783016204834, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3057514727115631, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8766, "grad_norm": 0.3849763572216034, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8073, "grad_norm": 0.30080053210258484, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8548, "grad_norm": 0.3595106303691864, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8232, "grad_norm": 0.31099820137023926, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7029, "grad_norm": 0.3157978355884552, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8265, "grad_norm": 0.27960965037345886, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7414, "grad_norm": 0.3102385103702545, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7461, "grad_norm": 0.32828861474990845, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8165, "grad_norm": 0.29560017585754395, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9455, "grad_norm": 0.33316895365715027, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8241, "grad_norm": 0.30420982837677, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7565, "grad_norm": 0.32619214057922363, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7945, "grad_norm": 0.3603750765323639, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7773, "grad_norm": 0.30834096670150757, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8058, "grad_norm": 0.28756365180015564, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.744, "grad_norm": 0.2878406345844269, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8581, "grad_norm": 0.31329697370529175, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7886, "grad_norm": 0.3405822515487671, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.778, "grad_norm": 0.305560827255249, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7592, "grad_norm": 0.2973416745662689, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8223, "grad_norm": 0.327303946018219, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8591, "grad_norm": 0.62595534324646, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3129784166812897, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8035, "grad_norm": 0.32496583461761475, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7787, "grad_norm": 0.3098868131637573, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7196, "grad_norm": 0.30726853013038635, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7898, "grad_norm": 0.2964220643043518, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8114, "grad_norm": 0.32352274656295776, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.811, "grad_norm": 0.2938912510871887, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7727, "grad_norm": 0.295559823513031, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9, "grad_norm": 0.34102028608322144, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8006, "grad_norm": 0.29676181077957153, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8099, "grad_norm": 0.3108902871608734, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7955, "grad_norm": 0.2690821588039398, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7881, "grad_norm": 0.32752540707588196, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7661, "grad_norm": 0.8029476404190063, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7733, "grad_norm": 0.30534422397613525, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7614, "grad_norm": 0.2899954319000244, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7845, "grad_norm": 0.28814372420310974, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8865, "grad_norm": 0.3061596751213074, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}, {"eval_loss": 1.8171186447143555, "eval_runtime": 53.6047, "eval_samples_per_second": 9.458, "eval_steps_per_second": 1.194, "epoch": 0.9993183367416496, "step": 733}, {"loss": 1.6202, "grad_norm": 0.3140897750854492, "learning_rate": 0.0002, "epoch": 1.008861622358555, "step": 740}, {"loss": 1.8409, "grad_norm": 0.3346109390258789, "learning_rate": 0.0002, "epoch": 1.0224948875255624, "step": 750}, {"loss": 1.6777, "grad_norm": 0.3582976758480072, "learning_rate": 0.0002, "epoch": 1.0361281526925699, "step": 760}, {"loss": 1.7306, "grad_norm": 0.30408260226249695, "learning_rate": 0.0002, "epoch": 1.0497614178595773, "step": 770}, {"loss": 1.6967, "grad_norm": 0.323585569858551, "learning_rate": 0.0002, "epoch": 1.0633946830265848, "step": 780}, {"loss": 1.768, "grad_norm": 0.3474137783050537, "learning_rate": 0.0002, "epoch": 1.0770279481935923, "step": 790}, {"loss": 1.6895, "grad_norm": 0.35721147060394287, "learning_rate": 0.0002, "epoch": 1.0906612133606, "step": 800}, {"loss": 1.718, "grad_norm": 0.35366931557655334, "learning_rate": 0.0002, "epoch": 1.1042944785276074, "step": 810}, {"loss": 1.6797, "grad_norm": 0.3250770568847656, "learning_rate": 0.0002, "epoch": 1.117927743694615, "step": 820}, {"loss": 1.6383, "grad_norm": 0.3293766379356384, "learning_rate": 0.0002, "epoch": 1.1315610088616224, "step": 830}, {"loss": 1.7353, "grad_norm": 0.3380851745605469, "learning_rate": 0.0002, "epoch": 1.1451942740286298, "step": 840}, {"loss": 1.8236, "grad_norm": 0.32584455609321594, "learning_rate": 0.0002, "epoch": 1.1588275391956373, "step": 850}, {"loss": 1.6681, "grad_norm": 0.45700767636299133, "learning_rate": 0.0002, "epoch": 1.1724608043626448, "step": 860}, {"loss": 1.7494, "grad_norm": 0.30944544076919556, "learning_rate": 0.0002, "epoch": 1.1860940695296525, "step": 870}, {"loss": 1.7426, "grad_norm": 0.3268151581287384, "learning_rate": 0.0002, "epoch": 1.19972733469666, "step": 880}, {"loss": 1.7413, "grad_norm": 0.39972540736198425, "learning_rate": 0.0002, "epoch": 1.2133605998636674, "step": 890}, {"loss": 1.7481, "grad_norm": 0.7890929579734802, "learning_rate": 0.0002, "epoch": 1.2269938650306749, "step": 900}, {"loss": 1.7608, "grad_norm": 0.3439182639122009, "learning_rate": 0.0002, "epoch": 1.2406271301976823, "step": 910}, {"loss": 1.7617, "grad_norm": 0.3986225128173828, "learning_rate": 0.0002, "epoch": 1.2542603953646898, "step": 920}, {"loss": 1.6843, "grad_norm": 0.3514605164527893, "learning_rate": 0.0002, "epoch": 1.2678936605316973, "step": 930}, {"loss": 1.6987, "grad_norm": 0.3682589530944824, "learning_rate": 0.0002, "epoch": 1.2815269256987047, "step": 940}, {"loss": 1.6988, "grad_norm": 0.3618335723876953, "learning_rate": 0.0002, "epoch": 1.2951601908657122, "step": 950}, {"loss": 1.7436, "grad_norm": 0.345700740814209, "learning_rate": 0.0002, "epoch": 1.30879345603272, "step": 960}, {"loss": 1.7336, "grad_norm": 0.3514927923679352, "learning_rate": 0.0002, "epoch": 1.3224267211997274, "step": 970}, {"loss": 1.7704, "grad_norm": 0.365647554397583, "learning_rate": 0.0002, "epoch": 1.3360599863667348, "step": 980}, {"loss": 1.7104, "grad_norm": 0.3407285809516907, "learning_rate": 0.0002, "epoch": 1.3496932515337423, "step": 990}, {"loss": 1.7132, "grad_norm": 0.3785437345504761, "learning_rate": 0.0002, "epoch": 1.3633265167007498, "step": 1000}, {"loss": 1.766, "grad_norm": 0.34746724367141724, "learning_rate": 0.0002, "epoch": 1.3769597818677572, "step": 1010}, {"loss": 1.7252, "grad_norm": 0.362444132566452, "learning_rate": 0.0002, "epoch": 1.390593047034765, "step": 1020}, {"loss": 1.7132, "grad_norm": 0.4424704611301422, "learning_rate": 0.0002, "epoch": 1.4042263122017724, "step": 1030}, {"loss": 1.726, "grad_norm": 0.38722458481788635, "learning_rate": 0.0002, "epoch": 1.4178595773687799, "step": 1040}, {"loss": 1.7955, "grad_norm": 0.36089080572128296, "learning_rate": 0.0002, "epoch": 1.4314928425357873, "step": 1050}, {"loss": 1.6924, "grad_norm": 0.33817124366760254, "learning_rate": 0.0002, "epoch": 1.4451261077027948, "step": 1060}, {"loss": 1.7165, "grad_norm": 0.34334081411361694, "learning_rate": 0.0002, "epoch": 1.4587593728698023, "step": 1070}, {"loss": 1.6999, "grad_norm": 0.3776826858520508, "learning_rate": 0.0002, "epoch": 1.4723926380368098, "step": 1080}, {"loss": 1.7605, "grad_norm": 0.4169026017189026, "learning_rate": 0.0002, "epoch": 1.4860259032038172, "step": 1090}, {"loss": 1.7502, "grad_norm": 0.34898945689201355, "learning_rate": 0.0002, "epoch": 1.4996591683708247, "step": 1100}, {"loss": 1.635, "grad_norm": 0.34223780035972595, "learning_rate": 0.0002, "epoch": 1.5132924335378322, "step": 1110}, {"loss": 1.7248, "grad_norm": 0.3686901032924652, "learning_rate": 0.0002, "epoch": 1.5269256987048399, "step": 1120}, {"loss": 1.7525, "grad_norm": 0.35054415464401245, "learning_rate": 0.0002, "epoch": 1.5405589638718473, "step": 1130}, {"loss": 1.7776, "grad_norm": 0.39496365189552307, "learning_rate": 0.0002, "epoch": 1.5541922290388548, "step": 1140}, {"loss": 1.6574, "grad_norm": 0.35451626777648926, "learning_rate": 0.0002, "epoch": 1.5678254942058623, "step": 1150}, {"loss": 1.7257, "grad_norm": 0.3848083019256592, "learning_rate": 0.0002, "epoch": 1.58145875937287, "step": 1160}, {"loss": 1.7272, "grad_norm": 0.3760537803173065, "learning_rate": 0.0002, "epoch": 1.5950920245398774, "step": 1170}, {"loss": 1.7441, "grad_norm": 0.38981738686561584, "learning_rate": 0.0002, "epoch": 1.6087252897068849, "step": 1180}, {"loss": 1.6951, "grad_norm": 0.36830949783325195, "learning_rate": 0.0002, "epoch": 1.6223585548738924, "step": 1190}, {"loss": 1.6925, "grad_norm": 0.3405892848968506, "learning_rate": 0.0002, "epoch": 1.6359918200408998, "step": 1200}, {"loss": 1.7473, "grad_norm": 0.39027872681617737, "learning_rate": 0.0002, "epoch": 1.6496250852079073, "step": 1210}, {"loss": 1.6792, "grad_norm": 0.3342694044113159, "learning_rate": 0.0002, "epoch": 1.6632583503749148, "step": 1220}, {"loss": 1.7196, "grad_norm": 0.3600076735019684, "learning_rate": 0.0002, "epoch": 1.6768916155419222, "step": 1230}, {"loss": 1.7021, "grad_norm": 0.3625542223453522, "learning_rate": 0.0002, "epoch": 1.6905248807089297, "step": 1240}, {"loss": 1.6772, "grad_norm": 0.32170894742012024, "learning_rate": 0.0002, "epoch": 1.7041581458759372, "step": 1250}, {"loss": 1.7152, "grad_norm": 0.3544139862060547, "learning_rate": 0.0002, "epoch": 1.7177914110429446, "step": 1260}, {"loss": 1.7138, "grad_norm": 0.35113027691841125, "learning_rate": 0.0002, "epoch": 1.7314246762099523, "step": 1270}, {"loss": 1.7095, "grad_norm": 0.3499974310398102, "learning_rate": 0.0002, "epoch": 1.7450579413769598, "step": 1280}, {"loss": 1.7749, "grad_norm": 0.3285157382488251, "learning_rate": 0.0002, "epoch": 1.7586912065439673, "step": 1290}, {"loss": 1.6767, "grad_norm": 0.3701961636543274, "learning_rate": 0.0002, "epoch": 1.7723244717109747, "step": 1300}, {"loss": 1.6282, "grad_norm": 0.3301318287849426, "learning_rate": 0.0002, "epoch": 1.7859577368779824, "step": 1310}, {"loss": 1.7097, "grad_norm": 0.37801554799079895, "learning_rate": 0.0002, "epoch": 1.79959100204499, "step": 1320}, {"loss": 1.7437, "grad_norm": 0.3726748526096344, "learning_rate": 0.0002, "epoch": 1.8132242672119974, "step": 1330}, {"loss": 1.7959, "grad_norm": 0.4059790074825287, "learning_rate": 0.0002, "epoch": 1.8268575323790048, "step": 1340}, {"loss": 1.7739, "grad_norm": 0.35712096095085144, "learning_rate": 0.0002, "epoch": 1.8404907975460123, "step": 1350}, {"loss": 1.6375, "grad_norm": 0.35995328426361084, "learning_rate": 0.0002, "epoch": 1.8541240627130198, "step": 1360}, {"loss": 1.7332, "grad_norm": 0.3679947257041931, "learning_rate": 0.0002, "epoch": 1.8677573278800272, "step": 1370}, {"loss": 1.7587, "grad_norm": 0.39645957946777344, "learning_rate": 0.0002, "epoch": 1.8813905930470347, "step": 1380}, {"loss": 1.6985, "grad_norm": 0.35288700461387634, "learning_rate": 0.0002, "epoch": 1.8950238582140422, "step": 1390}, {"loss": 1.6582, "grad_norm": 0.32579198479652405, "learning_rate": 0.0002, "epoch": 1.9086571233810496, "step": 1400}, {"loss": 1.6948, "grad_norm": 0.3856561779975891, "learning_rate": 0.0002, "epoch": 1.9222903885480571, "step": 1410}, {"loss": 1.668, "grad_norm": 0.39019331336021423, "learning_rate": 0.0002, "epoch": 1.9359236537150648, "step": 1420}, {"loss": 1.7774, "grad_norm": 0.38006502389907837, "learning_rate": 0.0002, "epoch": 1.9495569188820723, "step": 1430}, {"loss": 1.8323, "grad_norm": 0.38100454211235046, "learning_rate": 0.0002, "epoch": 1.9631901840490797, "step": 1440}, {"loss": 1.7298, "grad_norm": 0.3405798673629761, "learning_rate": 0.0002, "epoch": 1.9768234492160872, "step": 1450}, {"loss": 1.7045, "grad_norm": 0.36582913994789124, "learning_rate": 0.0002, "epoch": 1.990456714383095, "step": 1460}, {"eval_loss": 1.8178424835205078, "eval_runtime": 53.6524, "eval_samples_per_second": 9.45, "eval_steps_per_second": 1.193, "epoch": 2.0, "step": 1467}, {"loss": 1.6363, "grad_norm": 0.3626647889614105, "learning_rate": 0.0002, "epoch": 2.0040899795501024, "step": 1470}, {"loss": 1.5354, "grad_norm": 0.40171775221824646, "learning_rate": 0.0002, "epoch": 2.01772324471711, "step": 1480}, {"loss": 1.5566, "grad_norm": 0.5805319547653198, "learning_rate": 0.0002, "epoch": 2.0313565098841173, "step": 1490}, {"loss": 1.546, "grad_norm": 0.41954153776168823, "learning_rate": 0.0002, "epoch": 2.044989775051125, "step": 1500}, {"loss": 1.6158, "grad_norm": 0.47190725803375244, "learning_rate": 0.0002, "epoch": 2.0586230402181322, "step": 1510}, {"loss": 1.5841, "grad_norm": 0.4388456344604492, "learning_rate": 0.0002, "epoch": 2.0722563053851397, "step": 1520}, {"loss": 1.5835, "grad_norm": 2.2171926498413086, "learning_rate": 0.0002, "epoch": 2.085889570552147, "step": 1530}, {"loss": 1.6137, "grad_norm": 0.4314221143722534, "learning_rate": 0.0002, "epoch": 2.0995228357191547, "step": 1540}, {"loss": 1.5511, "grad_norm": 0.4154265522956848, "learning_rate": 0.0002, "epoch": 2.113156100886162, "step": 1550}, {"loss": 1.6323, "grad_norm": 0.5025539994239807, "learning_rate": 0.0002, "epoch": 2.1267893660531696, "step": 1560}, {"loss": 1.5903, "grad_norm": 0.5410493016242981, "learning_rate": 0.0002, "epoch": 2.140422631220177, "step": 1570}, {"loss": 1.507, "grad_norm": 0.4478487968444824, "learning_rate": 0.0002, "epoch": 2.1540558963871845, "step": 1580}, {"loss": 1.5536, "grad_norm": 0.4703652560710907, "learning_rate": 0.0002, "epoch": 2.1676891615541924, "step": 1590}, {"loss": 1.5991, "grad_norm": 0.4555390179157257, "learning_rate": 0.0002, "epoch": 2.1813224267212, "step": 1600}, {"loss": 1.6117, "grad_norm": 0.4877263903617859, "learning_rate": 0.0002, "epoch": 2.1949556918882074, "step": 1610}, {"loss": 1.5928, "grad_norm": 0.48708245158195496, "learning_rate": 0.0002, "epoch": 2.208588957055215, "step": 1620}, {"loss": 1.6106, "grad_norm": 0.47523951530456543, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1630}, {"loss": 1.6013, "grad_norm": 0.4889199733734131, "learning_rate": 0.0002, "epoch": 2.23585548738923, "step": 1640}, {"loss": 1.6633, "grad_norm": 0.4585252106189728, "learning_rate": 0.0002, "epoch": 2.2494887525562373, "step": 1650}, {"loss": 1.6075, "grad_norm": 0.4764868915081024, "learning_rate": 0.0002, "epoch": 2.2631220177232447, "step": 1660}, {"loss": 1.6427, "grad_norm": 0.5028976202011108, "learning_rate": 0.0002, "epoch": 2.276755282890252, "step": 1670}, {"loss": 1.6258, "grad_norm": 0.46131211519241333, "learning_rate": 0.0002, "epoch": 2.2903885480572597, "step": 1680}, {"loss": 1.654, "grad_norm": 0.5422874689102173, "learning_rate": 0.0002, "epoch": 2.304021813224267, "step": 1690}, {"loss": 1.6331, "grad_norm": 0.47615355253219604, "learning_rate": 0.0002, "epoch": 2.3176550783912746, "step": 1700}, {"loss": 1.642, "grad_norm": 0.48005548119544983, "learning_rate": 0.0002, "epoch": 2.331288343558282, "step": 1710}, {"loss": 1.581, "grad_norm": 0.4387182295322418, "learning_rate": 0.0002, "epoch": 2.3449216087252895, "step": 1720}, {"loss": 1.5612, "grad_norm": 0.4487272799015045, "learning_rate": 0.0002, "epoch": 2.358554873892297, "step": 1730}, {"loss": 1.5089, "grad_norm": 0.5046455264091492, "learning_rate": 0.0002, "epoch": 2.372188139059305, "step": 1740}, {"loss": 1.5769, "grad_norm": 0.4653521180152893, "learning_rate": 0.0002, "epoch": 2.3858214042263124, "step": 1750}, {"loss": 1.6201, "grad_norm": 0.4737723469734192, "learning_rate": 0.0002, "epoch": 2.39945466939332, "step": 1760}, {"loss": 1.5933, "grad_norm": 0.4501931071281433, "learning_rate": 0.0002, "epoch": 2.4130879345603273, "step": 1770}, {"loss": 1.6321, "grad_norm": 0.4772880971431732, "learning_rate": 0.0002, "epoch": 2.426721199727335, "step": 1780}, {"loss": 1.5454, "grad_norm": 0.4544616937637329, "learning_rate": 0.0002, "epoch": 2.4403544648943423, "step": 1790}, {"loss": 1.5501, "grad_norm": 0.488313227891922, "learning_rate": 0.0002, "epoch": 2.4539877300613497, "step": 1800}, {"loss": 1.5791, "grad_norm": 0.5057830214500427, "learning_rate": 0.0002, "epoch": 2.467620995228357, "step": 1810}, {"loss": 1.5645, "grad_norm": 0.5049484968185425, "learning_rate": 0.0002, "epoch": 2.4812542603953647, "step": 1820}, {"loss": 1.6268, "grad_norm": 0.44966644048690796, "learning_rate": 0.0002, "epoch": 2.494887525562372, "step": 1830}, {"loss": 1.5941, "grad_norm": 0.5072630643844604, "learning_rate": 0.0002, "epoch": 2.5085207907293796, "step": 1840}, {"loss": 1.5251, "grad_norm": 0.43989792466163635, "learning_rate": 0.0002, "epoch": 2.522154055896387, "step": 1850}, {"loss": 1.563, "grad_norm": 1.3504403829574585, "learning_rate": 0.0002, "epoch": 2.5357873210633946, "step": 1860}, {"loss": 1.5681, "grad_norm": 0.46545976400375366, "learning_rate": 0.0002, "epoch": 2.549420586230402, "step": 1870}, {"loss": 1.6368, "grad_norm": 0.4678342044353485, "learning_rate": 0.0002, "epoch": 2.5630538513974095, "step": 1880}, {"loss": 1.5814, "grad_norm": 0.529755711555481, "learning_rate": 0.0002, "epoch": 2.5766871165644174, "step": 1890}, {"loss": 1.5861, "grad_norm": 0.5000199675559998, "learning_rate": 0.0002, "epoch": 2.5903203817314244, "step": 1900}, {"loss": 1.6346, "grad_norm": 0.5649300217628479, "learning_rate": 0.0002, "epoch": 2.6039536468984323, "step": 1910}, {"loss": 1.6317, "grad_norm": 0.7920585870742798, "learning_rate": 0.0002, "epoch": 2.61758691206544, "step": 1920}, {"loss": 1.643, "grad_norm": 0.4960342049598694, "learning_rate": 0.0002, "epoch": 2.6312201772324473, "step": 1930}, {"loss": 1.6099, "grad_norm": 0.5324710011482239, "learning_rate": 0.0002, "epoch": 2.6448534423994547, "step": 1940}, {"loss": 1.5874, "grad_norm": 0.606343150138855, "learning_rate": 0.0002, "epoch": 2.658486707566462, "step": 1950}, {"loss": 1.5728, "grad_norm": 0.53038489818573, "learning_rate": 0.0002, "epoch": 2.6721199727334697, "step": 1960}, {"loss": 1.5583, "grad_norm": 0.4579465091228485, "learning_rate": 0.0002, "epoch": 2.685753237900477, "step": 1970}, {"loss": 1.6093, "grad_norm": 0.4541707932949066, "learning_rate": 0.0002, "epoch": 2.6993865030674846, "step": 1980}, {"loss": 1.5316, "grad_norm": 0.5009395480155945, "learning_rate": 0.0002, "epoch": 2.713019768234492, "step": 1990}, {"loss": 1.6724, "grad_norm": 0.4723006784915924, "learning_rate": 0.0002, "epoch": 2.7266530334014996, "step": 2000}, {"loss": 1.638, "grad_norm": 0.5086126923561096, "learning_rate": 0.0002, "epoch": 2.740286298568507, "step": 2010}, {"loss": 1.6223, "grad_norm": 0.47242608666419983, "learning_rate": 0.0002, "epoch": 2.7539195637355145, "step": 2020}, {"loss": 1.6242, "grad_norm": 0.44922566413879395, "learning_rate": 0.0002, "epoch": 2.767552828902522, "step": 2030}, {"loss": 1.6837, "grad_norm": 0.420259565114975, "learning_rate": 0.0002, "epoch": 2.78118609406953, "step": 2040}, {"loss": 1.5612, "grad_norm": 0.4762881100177765, "learning_rate": 0.0002, "epoch": 2.794819359236537, "step": 2050}, {"loss": 1.5506, "grad_norm": 0.5228786468505859, "learning_rate": 0.0002, "epoch": 2.808452624403545, "step": 2060}, {"loss": 1.6347, "grad_norm": 0.4796035587787628, "learning_rate": 0.0002, "epoch": 2.8220858895705523, "step": 2070}, {"loss": 1.6843, "grad_norm": 0.5034735202789307, "learning_rate": 0.0002, "epoch": 2.8357191547375598, "step": 2080}, {"loss": 1.6455, "grad_norm": 0.48005399107933044, "learning_rate": 0.0002, "epoch": 2.8493524199045672, "step": 2090}, {"loss": 1.6287, "grad_norm": 0.578820526599884, "learning_rate": 0.0002, "epoch": 2.8629856850715747, "step": 2100}, {"loss": 1.6021, "grad_norm": 0.48982638120651245, "learning_rate": 0.0002, "epoch": 2.876618950238582, "step": 2110}, {"loss": 1.5769, "grad_norm": 0.5157325863838196, "learning_rate": 0.0002, "epoch": 2.8902522154055896, "step": 2120}, {"loss": 1.6089, "grad_norm": 0.49149683117866516, "learning_rate": 0.0002, "epoch": 2.903885480572597, "step": 2130}, {"loss": 1.5881, "grad_norm": 0.48584499955177307, "learning_rate": 0.0002, "epoch": 2.9175187457396046, "step": 2140}, {"loss": 1.5833, "grad_norm": 0.5199017524719238, "learning_rate": 0.0002, "epoch": 2.931152010906612, "step": 2150}, {"loss": 1.7344, "grad_norm": 0.5788236856460571, "learning_rate": 0.0002, "epoch": 2.9447852760736195, "step": 2160}, {"loss": 1.6103, "grad_norm": 0.48664185404777527, "learning_rate": 0.0002, "epoch": 2.958418541240627, "step": 2170}, {"loss": 1.5765, "grad_norm": 0.5026682615280151, "learning_rate": 0.0002, "epoch": 2.9720518064076344, "step": 2180}, {"loss": 1.6626, "grad_norm": 0.49317044019699097, "learning_rate": 0.0002, "epoch": 2.9856850715746424, "step": 2190}, {"loss": 1.5871, "grad_norm": 0.5729128122329712, "learning_rate": 0.0002, "epoch": 2.9993183367416494, "step": 2200}]} +{"epoch": 4.0, "step": 2934, "epoch_duration": 1094.484756231308, "total_accumulated_duration": 4381.578528881073, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0982, "grad_norm": 0.7714291214942932, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5206, "grad_norm": 0.5473978519439697, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3079, "grad_norm": 0.5452795624732971, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 2.0019, "grad_norm": 0.5098028779029846, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9333, "grad_norm": 0.48062971234321594, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9355, "grad_norm": 0.4505695104598999, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9312, "grad_norm": 0.41609591245651245, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8656, "grad_norm": 0.4323892593383789, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9294, "grad_norm": 0.4670293629169464, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7946, "grad_norm": 0.40623316168785095, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8565, "grad_norm": 0.3620383143424988, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9238, "grad_norm": 0.332218736410141, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.93, "grad_norm": 0.4004521667957306, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7549, "grad_norm": 0.3698360323905945, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8771, "grad_norm": 0.3847949504852295, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8316, "grad_norm": 0.36843451857566833, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.838, "grad_norm": 0.37301021814346313, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8909, "grad_norm": 0.3718886971473694, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8454, "grad_norm": 0.3088490962982178, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9254, "grad_norm": 0.3611852526664734, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7844, "grad_norm": 0.36093324422836304, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3250400722026825, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8729, "grad_norm": 0.3566756248474121, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9259, "grad_norm": 0.32872408628463745, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9033, "grad_norm": 0.3983881175518036, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8588, "grad_norm": 0.3571510910987854, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8539, "grad_norm": 0.3036131262779236, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8572, "grad_norm": 0.36512863636016846, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8022, "grad_norm": 0.3429736793041229, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8754, "grad_norm": 0.3055964708328247, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8384, "grad_norm": 0.33801034092903137, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7933, "grad_norm": 0.348783016204834, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3057514727115631, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8766, "grad_norm": 0.3849763572216034, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8073, "grad_norm": 0.30080053210258484, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8548, "grad_norm": 0.3595106303691864, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8232, "grad_norm": 0.31099820137023926, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7029, "grad_norm": 0.3157978355884552, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8265, "grad_norm": 0.27960965037345886, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7414, "grad_norm": 0.3102385103702545, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7461, "grad_norm": 0.32828861474990845, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8165, "grad_norm": 0.29560017585754395, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9455, "grad_norm": 0.33316895365715027, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8241, "grad_norm": 0.30420982837677, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7565, "grad_norm": 0.32619214057922363, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7945, "grad_norm": 0.3603750765323639, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7773, "grad_norm": 0.30834096670150757, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8058, "grad_norm": 0.28756365180015564, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.744, "grad_norm": 0.2878406345844269, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8581, "grad_norm": 0.31329697370529175, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7886, "grad_norm": 0.3405822515487671, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.778, "grad_norm": 0.305560827255249, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7592, "grad_norm": 0.2973416745662689, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8223, "grad_norm": 0.327303946018219, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8591, "grad_norm": 0.62595534324646, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3129784166812897, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8035, "grad_norm": 0.32496583461761475, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7787, "grad_norm": 0.3098868131637573, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7196, "grad_norm": 0.30726853013038635, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7898, "grad_norm": 0.2964220643043518, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8114, "grad_norm": 0.32352274656295776, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.811, "grad_norm": 0.2938912510871887, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7727, "grad_norm": 0.295559823513031, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9, "grad_norm": 0.34102028608322144, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8006, "grad_norm": 0.29676181077957153, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8099, "grad_norm": 0.3108902871608734, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7955, "grad_norm": 0.2690821588039398, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7881, "grad_norm": 0.32752540707588196, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7661, "grad_norm": 0.8029476404190063, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7733, "grad_norm": 0.30534422397613525, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7614, "grad_norm": 0.2899954319000244, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7845, "grad_norm": 0.28814372420310974, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8865, "grad_norm": 0.3061596751213074, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}, {"eval_loss": 1.8171186447143555, "eval_runtime": 53.6047, "eval_samples_per_second": 9.458, "eval_steps_per_second": 1.194, "epoch": 0.9993183367416496, "step": 733}, {"loss": 1.6202, "grad_norm": 0.3140897750854492, "learning_rate": 0.0002, "epoch": 1.008861622358555, "step": 740}, {"loss": 1.8409, "grad_norm": 0.3346109390258789, "learning_rate": 0.0002, "epoch": 1.0224948875255624, "step": 750}, {"loss": 1.6777, "grad_norm": 0.3582976758480072, "learning_rate": 0.0002, "epoch": 1.0361281526925699, "step": 760}, {"loss": 1.7306, "grad_norm": 0.30408260226249695, "learning_rate": 0.0002, "epoch": 1.0497614178595773, "step": 770}, {"loss": 1.6967, "grad_norm": 0.323585569858551, "learning_rate": 0.0002, "epoch": 1.0633946830265848, "step": 780}, {"loss": 1.768, "grad_norm": 0.3474137783050537, "learning_rate": 0.0002, "epoch": 1.0770279481935923, "step": 790}, {"loss": 1.6895, "grad_norm": 0.35721147060394287, "learning_rate": 0.0002, "epoch": 1.0906612133606, "step": 800}, {"loss": 1.718, "grad_norm": 0.35366931557655334, "learning_rate": 0.0002, "epoch": 1.1042944785276074, "step": 810}, {"loss": 1.6797, "grad_norm": 0.3250770568847656, "learning_rate": 0.0002, "epoch": 1.117927743694615, "step": 820}, {"loss": 1.6383, "grad_norm": 0.3293766379356384, "learning_rate": 0.0002, "epoch": 1.1315610088616224, "step": 830}, {"loss": 1.7353, "grad_norm": 0.3380851745605469, "learning_rate": 0.0002, "epoch": 1.1451942740286298, "step": 840}, {"loss": 1.8236, "grad_norm": 0.32584455609321594, "learning_rate": 0.0002, "epoch": 1.1588275391956373, "step": 850}, {"loss": 1.6681, "grad_norm": 0.45700767636299133, "learning_rate": 0.0002, "epoch": 1.1724608043626448, "step": 860}, {"loss": 1.7494, "grad_norm": 0.30944544076919556, "learning_rate": 0.0002, "epoch": 1.1860940695296525, "step": 870}, {"loss": 1.7426, "grad_norm": 0.3268151581287384, "learning_rate": 0.0002, "epoch": 1.19972733469666, "step": 880}, {"loss": 1.7413, "grad_norm": 0.39972540736198425, "learning_rate": 0.0002, "epoch": 1.2133605998636674, "step": 890}, {"loss": 1.7481, "grad_norm": 0.7890929579734802, "learning_rate": 0.0002, "epoch": 1.2269938650306749, "step": 900}, {"loss": 1.7608, "grad_norm": 0.3439182639122009, "learning_rate": 0.0002, "epoch": 1.2406271301976823, "step": 910}, {"loss": 1.7617, "grad_norm": 0.3986225128173828, "learning_rate": 0.0002, "epoch": 1.2542603953646898, "step": 920}, {"loss": 1.6843, "grad_norm": 0.3514605164527893, "learning_rate": 0.0002, "epoch": 1.2678936605316973, "step": 930}, {"loss": 1.6987, "grad_norm": 0.3682589530944824, "learning_rate": 0.0002, "epoch": 1.2815269256987047, "step": 940}, {"loss": 1.6988, "grad_norm": 0.3618335723876953, "learning_rate": 0.0002, "epoch": 1.2951601908657122, "step": 950}, {"loss": 1.7436, "grad_norm": 0.345700740814209, "learning_rate": 0.0002, "epoch": 1.30879345603272, "step": 960}, {"loss": 1.7336, "grad_norm": 0.3514927923679352, "learning_rate": 0.0002, "epoch": 1.3224267211997274, "step": 970}, {"loss": 1.7704, "grad_norm": 0.365647554397583, "learning_rate": 0.0002, "epoch": 1.3360599863667348, "step": 980}, {"loss": 1.7104, "grad_norm": 0.3407285809516907, "learning_rate": 0.0002, "epoch": 1.3496932515337423, "step": 990}, {"loss": 1.7132, "grad_norm": 0.3785437345504761, "learning_rate": 0.0002, "epoch": 1.3633265167007498, "step": 1000}, {"loss": 1.766, "grad_norm": 0.34746724367141724, "learning_rate": 0.0002, "epoch": 1.3769597818677572, "step": 1010}, {"loss": 1.7252, "grad_norm": 0.362444132566452, "learning_rate": 0.0002, "epoch": 1.390593047034765, "step": 1020}, {"loss": 1.7132, "grad_norm": 0.4424704611301422, "learning_rate": 0.0002, "epoch": 1.4042263122017724, "step": 1030}, {"loss": 1.726, "grad_norm": 0.38722458481788635, "learning_rate": 0.0002, "epoch": 1.4178595773687799, "step": 1040}, {"loss": 1.7955, "grad_norm": 0.36089080572128296, "learning_rate": 0.0002, "epoch": 1.4314928425357873, "step": 1050}, {"loss": 1.6924, "grad_norm": 0.33817124366760254, "learning_rate": 0.0002, "epoch": 1.4451261077027948, "step": 1060}, {"loss": 1.7165, "grad_norm": 0.34334081411361694, "learning_rate": 0.0002, "epoch": 1.4587593728698023, "step": 1070}, {"loss": 1.6999, "grad_norm": 0.3776826858520508, "learning_rate": 0.0002, "epoch": 1.4723926380368098, "step": 1080}, {"loss": 1.7605, "grad_norm": 0.4169026017189026, "learning_rate": 0.0002, "epoch": 1.4860259032038172, "step": 1090}, {"loss": 1.7502, "grad_norm": 0.34898945689201355, "learning_rate": 0.0002, "epoch": 1.4996591683708247, "step": 1100}, {"loss": 1.635, "grad_norm": 0.34223780035972595, "learning_rate": 0.0002, "epoch": 1.5132924335378322, "step": 1110}, {"loss": 1.7248, "grad_norm": 0.3686901032924652, "learning_rate": 0.0002, "epoch": 1.5269256987048399, "step": 1120}, {"loss": 1.7525, "grad_norm": 0.35054415464401245, "learning_rate": 0.0002, "epoch": 1.5405589638718473, "step": 1130}, {"loss": 1.7776, "grad_norm": 0.39496365189552307, "learning_rate": 0.0002, "epoch": 1.5541922290388548, "step": 1140}, {"loss": 1.6574, "grad_norm": 0.35451626777648926, "learning_rate": 0.0002, "epoch": 1.5678254942058623, "step": 1150}, {"loss": 1.7257, "grad_norm": 0.3848083019256592, "learning_rate": 0.0002, "epoch": 1.58145875937287, "step": 1160}, {"loss": 1.7272, "grad_norm": 0.3760537803173065, "learning_rate": 0.0002, "epoch": 1.5950920245398774, "step": 1170}, {"loss": 1.7441, "grad_norm": 0.38981738686561584, "learning_rate": 0.0002, "epoch": 1.6087252897068849, "step": 1180}, {"loss": 1.6951, "grad_norm": 0.36830949783325195, "learning_rate": 0.0002, "epoch": 1.6223585548738924, "step": 1190}, {"loss": 1.6925, "grad_norm": 0.3405892848968506, "learning_rate": 0.0002, "epoch": 1.6359918200408998, "step": 1200}, {"loss": 1.7473, "grad_norm": 0.39027872681617737, "learning_rate": 0.0002, "epoch": 1.6496250852079073, "step": 1210}, {"loss": 1.6792, "grad_norm": 0.3342694044113159, "learning_rate": 0.0002, "epoch": 1.6632583503749148, "step": 1220}, {"loss": 1.7196, "grad_norm": 0.3600076735019684, "learning_rate": 0.0002, "epoch": 1.6768916155419222, "step": 1230}, {"loss": 1.7021, "grad_norm": 0.3625542223453522, "learning_rate": 0.0002, "epoch": 1.6905248807089297, "step": 1240}, {"loss": 1.6772, "grad_norm": 0.32170894742012024, "learning_rate": 0.0002, "epoch": 1.7041581458759372, "step": 1250}, {"loss": 1.7152, "grad_norm": 0.3544139862060547, "learning_rate": 0.0002, "epoch": 1.7177914110429446, "step": 1260}, {"loss": 1.7138, "grad_norm": 0.35113027691841125, "learning_rate": 0.0002, "epoch": 1.7314246762099523, "step": 1270}, {"loss": 1.7095, "grad_norm": 0.3499974310398102, "learning_rate": 0.0002, "epoch": 1.7450579413769598, "step": 1280}, {"loss": 1.7749, "grad_norm": 0.3285157382488251, "learning_rate": 0.0002, "epoch": 1.7586912065439673, "step": 1290}, {"loss": 1.6767, "grad_norm": 0.3701961636543274, "learning_rate": 0.0002, "epoch": 1.7723244717109747, "step": 1300}, {"loss": 1.6282, "grad_norm": 0.3301318287849426, "learning_rate": 0.0002, "epoch": 1.7859577368779824, "step": 1310}, {"loss": 1.7097, "grad_norm": 0.37801554799079895, "learning_rate": 0.0002, "epoch": 1.79959100204499, "step": 1320}, {"loss": 1.7437, "grad_norm": 0.3726748526096344, "learning_rate": 0.0002, "epoch": 1.8132242672119974, "step": 1330}, {"loss": 1.7959, "grad_norm": 0.4059790074825287, "learning_rate": 0.0002, "epoch": 1.8268575323790048, "step": 1340}, {"loss": 1.7739, "grad_norm": 0.35712096095085144, "learning_rate": 0.0002, "epoch": 1.8404907975460123, "step": 1350}, {"loss": 1.6375, "grad_norm": 0.35995328426361084, "learning_rate": 0.0002, "epoch": 1.8541240627130198, "step": 1360}, {"loss": 1.7332, "grad_norm": 0.3679947257041931, "learning_rate": 0.0002, "epoch": 1.8677573278800272, "step": 1370}, {"loss": 1.7587, "grad_norm": 0.39645957946777344, "learning_rate": 0.0002, "epoch": 1.8813905930470347, "step": 1380}, {"loss": 1.6985, "grad_norm": 0.35288700461387634, "learning_rate": 0.0002, "epoch": 1.8950238582140422, "step": 1390}, {"loss": 1.6582, "grad_norm": 0.32579198479652405, "learning_rate": 0.0002, "epoch": 1.9086571233810496, "step": 1400}, {"loss": 1.6948, "grad_norm": 0.3856561779975891, "learning_rate": 0.0002, "epoch": 1.9222903885480571, "step": 1410}, {"loss": 1.668, "grad_norm": 0.39019331336021423, "learning_rate": 0.0002, "epoch": 1.9359236537150648, "step": 1420}, {"loss": 1.7774, "grad_norm": 0.38006502389907837, "learning_rate": 0.0002, "epoch": 1.9495569188820723, "step": 1430}, {"loss": 1.8323, "grad_norm": 0.38100454211235046, "learning_rate": 0.0002, "epoch": 1.9631901840490797, "step": 1440}, {"loss": 1.7298, "grad_norm": 0.3405798673629761, "learning_rate": 0.0002, "epoch": 1.9768234492160872, "step": 1450}, {"loss": 1.7045, "grad_norm": 0.36582913994789124, "learning_rate": 0.0002, "epoch": 1.990456714383095, "step": 1460}, {"eval_loss": 1.8178424835205078, "eval_runtime": 53.6524, "eval_samples_per_second": 9.45, "eval_steps_per_second": 1.193, "epoch": 2.0, "step": 1467}, {"loss": 1.6363, "grad_norm": 0.3626647889614105, "learning_rate": 0.0002, "epoch": 2.0040899795501024, "step": 1470}, {"loss": 1.5354, "grad_norm": 0.40171775221824646, "learning_rate": 0.0002, "epoch": 2.01772324471711, "step": 1480}, {"loss": 1.5566, "grad_norm": 0.5805319547653198, "learning_rate": 0.0002, "epoch": 2.0313565098841173, "step": 1490}, {"loss": 1.546, "grad_norm": 0.41954153776168823, "learning_rate": 0.0002, "epoch": 2.044989775051125, "step": 1500}, {"loss": 1.6158, "grad_norm": 0.47190725803375244, "learning_rate": 0.0002, "epoch": 2.0586230402181322, "step": 1510}, {"loss": 1.5841, "grad_norm": 0.4388456344604492, "learning_rate": 0.0002, "epoch": 2.0722563053851397, "step": 1520}, {"loss": 1.5835, "grad_norm": 2.2171926498413086, "learning_rate": 0.0002, "epoch": 2.085889570552147, "step": 1530}, {"loss": 1.6137, "grad_norm": 0.4314221143722534, "learning_rate": 0.0002, "epoch": 2.0995228357191547, "step": 1540}, {"loss": 1.5511, "grad_norm": 0.4154265522956848, "learning_rate": 0.0002, "epoch": 2.113156100886162, "step": 1550}, {"loss": 1.6323, "grad_norm": 0.5025539994239807, "learning_rate": 0.0002, "epoch": 2.1267893660531696, "step": 1560}, {"loss": 1.5903, "grad_norm": 0.5410493016242981, "learning_rate": 0.0002, "epoch": 2.140422631220177, "step": 1570}, {"loss": 1.507, "grad_norm": 0.4478487968444824, "learning_rate": 0.0002, "epoch": 2.1540558963871845, "step": 1580}, {"loss": 1.5536, "grad_norm": 0.4703652560710907, "learning_rate": 0.0002, "epoch": 2.1676891615541924, "step": 1590}, {"loss": 1.5991, "grad_norm": 0.4555390179157257, "learning_rate": 0.0002, "epoch": 2.1813224267212, "step": 1600}, {"loss": 1.6117, "grad_norm": 0.4877263903617859, "learning_rate": 0.0002, "epoch": 2.1949556918882074, "step": 1610}, {"loss": 1.5928, "grad_norm": 0.48708245158195496, "learning_rate": 0.0002, "epoch": 2.208588957055215, "step": 1620}, {"loss": 1.6106, "grad_norm": 0.47523951530456543, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1630}, {"loss": 1.6013, "grad_norm": 0.4889199733734131, "learning_rate": 0.0002, "epoch": 2.23585548738923, "step": 1640}, {"loss": 1.6633, "grad_norm": 0.4585252106189728, "learning_rate": 0.0002, "epoch": 2.2494887525562373, "step": 1650}, {"loss": 1.6075, "grad_norm": 0.4764868915081024, "learning_rate": 0.0002, "epoch": 2.2631220177232447, "step": 1660}, {"loss": 1.6427, "grad_norm": 0.5028976202011108, "learning_rate": 0.0002, "epoch": 2.276755282890252, "step": 1670}, {"loss": 1.6258, "grad_norm": 0.46131211519241333, "learning_rate": 0.0002, "epoch": 2.2903885480572597, "step": 1680}, {"loss": 1.654, "grad_norm": 0.5422874689102173, "learning_rate": 0.0002, "epoch": 2.304021813224267, "step": 1690}, {"loss": 1.6331, "grad_norm": 0.47615355253219604, "learning_rate": 0.0002, "epoch": 2.3176550783912746, "step": 1700}, {"loss": 1.642, "grad_norm": 0.48005548119544983, "learning_rate": 0.0002, "epoch": 2.331288343558282, "step": 1710}, {"loss": 1.581, "grad_norm": 0.4387182295322418, "learning_rate": 0.0002, "epoch": 2.3449216087252895, "step": 1720}, {"loss": 1.5612, "grad_norm": 0.4487272799015045, "learning_rate": 0.0002, "epoch": 2.358554873892297, "step": 1730}, {"loss": 1.5089, "grad_norm": 0.5046455264091492, "learning_rate": 0.0002, "epoch": 2.372188139059305, "step": 1740}, {"loss": 1.5769, "grad_norm": 0.4653521180152893, "learning_rate": 0.0002, "epoch": 2.3858214042263124, "step": 1750}, {"loss": 1.6201, "grad_norm": 0.4737723469734192, "learning_rate": 0.0002, "epoch": 2.39945466939332, "step": 1760}, {"loss": 1.5933, "grad_norm": 0.4501931071281433, "learning_rate": 0.0002, "epoch": 2.4130879345603273, "step": 1770}, {"loss": 1.6321, "grad_norm": 0.4772880971431732, "learning_rate": 0.0002, "epoch": 2.426721199727335, "step": 1780}, {"loss": 1.5454, "grad_norm": 0.4544616937637329, "learning_rate": 0.0002, "epoch": 2.4403544648943423, "step": 1790}, {"loss": 1.5501, "grad_norm": 0.488313227891922, "learning_rate": 0.0002, "epoch": 2.4539877300613497, "step": 1800}, {"loss": 1.5791, "grad_norm": 0.5057830214500427, "learning_rate": 0.0002, "epoch": 2.467620995228357, "step": 1810}, {"loss": 1.5645, "grad_norm": 0.5049484968185425, "learning_rate": 0.0002, "epoch": 2.4812542603953647, "step": 1820}, {"loss": 1.6268, "grad_norm": 0.44966644048690796, "learning_rate": 0.0002, "epoch": 2.494887525562372, "step": 1830}, {"loss": 1.5941, "grad_norm": 0.5072630643844604, "learning_rate": 0.0002, "epoch": 2.5085207907293796, "step": 1840}, {"loss": 1.5251, "grad_norm": 0.43989792466163635, "learning_rate": 0.0002, "epoch": 2.522154055896387, "step": 1850}, {"loss": 1.563, "grad_norm": 1.3504403829574585, "learning_rate": 0.0002, "epoch": 2.5357873210633946, "step": 1860}, {"loss": 1.5681, "grad_norm": 0.46545976400375366, "learning_rate": 0.0002, "epoch": 2.549420586230402, "step": 1870}, {"loss": 1.6368, "grad_norm": 0.4678342044353485, "learning_rate": 0.0002, "epoch": 2.5630538513974095, "step": 1880}, {"loss": 1.5814, "grad_norm": 0.529755711555481, "learning_rate": 0.0002, "epoch": 2.5766871165644174, "step": 1890}, {"loss": 1.5861, "grad_norm": 0.5000199675559998, "learning_rate": 0.0002, "epoch": 2.5903203817314244, "step": 1900}, {"loss": 1.6346, "grad_norm": 0.5649300217628479, "learning_rate": 0.0002, "epoch": 2.6039536468984323, "step": 1910}, {"loss": 1.6317, "grad_norm": 0.7920585870742798, "learning_rate": 0.0002, "epoch": 2.61758691206544, "step": 1920}, {"loss": 1.643, "grad_norm": 0.4960342049598694, "learning_rate": 0.0002, "epoch": 2.6312201772324473, "step": 1930}, {"loss": 1.6099, "grad_norm": 0.5324710011482239, "learning_rate": 0.0002, "epoch": 2.6448534423994547, "step": 1940}, {"loss": 1.5874, "grad_norm": 0.606343150138855, "learning_rate": 0.0002, "epoch": 2.658486707566462, "step": 1950}, {"loss": 1.5728, "grad_norm": 0.53038489818573, "learning_rate": 0.0002, "epoch": 2.6721199727334697, "step": 1960}, {"loss": 1.5583, "grad_norm": 0.4579465091228485, "learning_rate": 0.0002, "epoch": 2.685753237900477, "step": 1970}, {"loss": 1.6093, "grad_norm": 0.4541707932949066, "learning_rate": 0.0002, "epoch": 2.6993865030674846, "step": 1980}, {"loss": 1.5316, "grad_norm": 0.5009395480155945, "learning_rate": 0.0002, "epoch": 2.713019768234492, "step": 1990}, {"loss": 1.6724, "grad_norm": 0.4723006784915924, "learning_rate": 0.0002, "epoch": 2.7266530334014996, "step": 2000}, {"loss": 1.638, "grad_norm": 0.5086126923561096, "learning_rate": 0.0002, "epoch": 2.740286298568507, "step": 2010}, {"loss": 1.6223, "grad_norm": 0.47242608666419983, "learning_rate": 0.0002, "epoch": 2.7539195637355145, "step": 2020}, {"loss": 1.6242, "grad_norm": 0.44922566413879395, "learning_rate": 0.0002, "epoch": 2.767552828902522, "step": 2030}, {"loss": 1.6837, "grad_norm": 0.420259565114975, "learning_rate": 0.0002, "epoch": 2.78118609406953, "step": 2040}, {"loss": 1.5612, "grad_norm": 0.4762881100177765, "learning_rate": 0.0002, "epoch": 2.794819359236537, "step": 2050}, {"loss": 1.5506, "grad_norm": 0.5228786468505859, "learning_rate": 0.0002, "epoch": 2.808452624403545, "step": 2060}, {"loss": 1.6347, "grad_norm": 0.4796035587787628, "learning_rate": 0.0002, "epoch": 2.8220858895705523, "step": 2070}, {"loss": 1.6843, "grad_norm": 0.5034735202789307, "learning_rate": 0.0002, "epoch": 2.8357191547375598, "step": 2080}, {"loss": 1.6455, "grad_norm": 0.48005399107933044, "learning_rate": 0.0002, "epoch": 2.8493524199045672, "step": 2090}, {"loss": 1.6287, "grad_norm": 0.578820526599884, "learning_rate": 0.0002, "epoch": 2.8629856850715747, "step": 2100}, {"loss": 1.6021, "grad_norm": 0.48982638120651245, "learning_rate": 0.0002, "epoch": 2.876618950238582, "step": 2110}, {"loss": 1.5769, "grad_norm": 0.5157325863838196, "learning_rate": 0.0002, "epoch": 2.8902522154055896, "step": 2120}, {"loss": 1.6089, "grad_norm": 0.49149683117866516, "learning_rate": 0.0002, "epoch": 2.903885480572597, "step": 2130}, {"loss": 1.5881, "grad_norm": 0.48584499955177307, "learning_rate": 0.0002, "epoch": 2.9175187457396046, "step": 2140}, {"loss": 1.5833, "grad_norm": 0.5199017524719238, "learning_rate": 0.0002, "epoch": 2.931152010906612, "step": 2150}, {"loss": 1.7344, "grad_norm": 0.5788236856460571, "learning_rate": 0.0002, "epoch": 2.9447852760736195, "step": 2160}, {"loss": 1.6103, "grad_norm": 0.48664185404777527, "learning_rate": 0.0002, "epoch": 2.958418541240627, "step": 2170}, {"loss": 1.5765, "grad_norm": 0.5026682615280151, "learning_rate": 0.0002, "epoch": 2.9720518064076344, "step": 2180}, {"loss": 1.6626, "grad_norm": 0.49317044019699097, "learning_rate": 0.0002, "epoch": 2.9856850715746424, "step": 2190}, {"loss": 1.5871, "grad_norm": 0.5729128122329712, "learning_rate": 0.0002, "epoch": 2.9993183367416494, "step": 2200}, {"eval_loss": 1.8527295589447021, "eval_runtime": 53.6403, "eval_samples_per_second": 9.452, "eval_steps_per_second": 1.193, "epoch": 2.9993183367416494, "step": 2200}, {"loss": 1.4719, "grad_norm": 0.5530241131782532, "learning_rate": 0.0002, "epoch": 3.0129516019086573, "step": 2210}, {"loss": 1.4088, "grad_norm": 0.6642216444015503, "learning_rate": 0.0002, "epoch": 3.0265848670756648, "step": 2220}, {"loss": 1.4382, "grad_norm": 0.61470627784729, "learning_rate": 0.0002, "epoch": 3.0402181322426722, "step": 2230}, {"loss": 1.4634, "grad_norm": 0.8559566140174866, "learning_rate": 0.0002, "epoch": 3.0538513974096797, "step": 2240}, {"loss": 1.3854, "grad_norm": 0.7015801668167114, "learning_rate": 0.0002, "epoch": 3.067484662576687, "step": 2250}, {"loss": 1.4981, "grad_norm": 0.7226442694664001, "learning_rate": 0.0002, "epoch": 3.0811179277436946, "step": 2260}, {"loss": 1.4143, "grad_norm": 0.7560588717460632, "learning_rate": 0.0002, "epoch": 3.094751192910702, "step": 2270}, {"loss": 1.4395, "grad_norm": 0.6216568946838379, "learning_rate": 0.0002, "epoch": 3.1083844580777096, "step": 2280}, {"loss": 1.3842, "grad_norm": 0.6768500804901123, "learning_rate": 0.0002, "epoch": 3.122017723244717, "step": 2290}, {"loss": 1.4672, "grad_norm": 0.7028762102127075, "learning_rate": 0.0002, "epoch": 3.1356509884117245, "step": 2300}, {"loss": 1.3826, "grad_norm": 0.6329697966575623, "learning_rate": 0.0002, "epoch": 3.149284253578732, "step": 2310}, {"loss": 1.442, "grad_norm": 0.6328264474868774, "learning_rate": 0.0002, "epoch": 3.1629175187457395, "step": 2320}, {"loss": 1.3762, "grad_norm": 0.7573632001876831, "learning_rate": 0.0002, "epoch": 3.176550783912747, "step": 2330}, {"loss": 1.3553, "grad_norm": 0.595740795135498, "learning_rate": 0.0002, "epoch": 3.190184049079755, "step": 2340}, {"loss": 1.3953, "grad_norm": 0.7111806869506836, "learning_rate": 0.0002, "epoch": 3.2038173142467623, "step": 2350}, {"loss": 1.3797, "grad_norm": 0.6328730583190918, "learning_rate": 0.0002, "epoch": 3.2174505794137698, "step": 2360}, {"loss": 1.3855, "grad_norm": 0.5860254168510437, "learning_rate": 0.0002, "epoch": 3.2310838445807772, "step": 2370}, {"loss": 1.4267, "grad_norm": 0.7387157082557678, "learning_rate": 0.0002, "epoch": 3.2447171097477847, "step": 2380}, {"loss": 1.4837, "grad_norm": 0.6897673606872559, "learning_rate": 0.0002, "epoch": 3.258350374914792, "step": 2390}, {"loss": 1.4372, "grad_norm": 0.7157699465751648, "learning_rate": 0.0002, "epoch": 3.2719836400817996, "step": 2400}, {"loss": 1.4432, "grad_norm": 0.6422511339187622, "learning_rate": 0.0002, "epoch": 3.285616905248807, "step": 2410}, {"loss": 1.4828, "grad_norm": 1.0481886863708496, "learning_rate": 0.0002, "epoch": 3.2992501704158146, "step": 2420}, {"loss": 1.4473, "grad_norm": 0.7050786018371582, "learning_rate": 0.0002, "epoch": 3.312883435582822, "step": 2430}, {"loss": 1.3465, "grad_norm": 0.6090759038925171, "learning_rate": 0.0002, "epoch": 3.3265167007498295, "step": 2440}, {"loss": 1.4619, "grad_norm": 0.6626465320587158, "learning_rate": 0.0002, "epoch": 3.340149965916837, "step": 2450}, {"loss": 1.4512, "grad_norm": 0.6565486788749695, "learning_rate": 0.0002, "epoch": 3.3537832310838445, "step": 2460}, {"loss": 1.588, "grad_norm": 0.6449528932571411, "learning_rate": 0.0002, "epoch": 3.367416496250852, "step": 2470}, {"loss": 1.4773, "grad_norm": 0.7746227383613586, "learning_rate": 0.0002, "epoch": 3.3810497614178594, "step": 2480}, {"loss": 1.417, "grad_norm": 0.7074846029281616, "learning_rate": 0.0002, "epoch": 3.3946830265848673, "step": 2490}, {"loss": 1.4476, "grad_norm": 0.6547690033912659, "learning_rate": 0.0002, "epoch": 3.4083162917518743, "step": 2500}, {"loss": 1.4074, "grad_norm": 0.784721314907074, "learning_rate": 0.0002, "epoch": 3.4219495569188823, "step": 2510}, {"loss": 1.4326, "grad_norm": 0.7270277738571167, "learning_rate": 0.0002, "epoch": 3.4355828220858897, "step": 2520}, {"loss": 1.4354, "grad_norm": 0.67588871717453, "learning_rate": 0.0002, "epoch": 3.449216087252897, "step": 2530}, {"loss": 1.4074, "grad_norm": 0.6768023371696472, "learning_rate": 0.0002, "epoch": 3.4628493524199047, "step": 2540}, {"loss": 1.4863, "grad_norm": 0.7026481628417969, "learning_rate": 0.0002, "epoch": 3.476482617586912, "step": 2550}, {"loss": 1.468, "grad_norm": 0.646075963973999, "learning_rate": 0.0002, "epoch": 3.4901158827539196, "step": 2560}, {"loss": 1.4058, "grad_norm": 0.6288973689079285, "learning_rate": 0.0002, "epoch": 3.503749147920927, "step": 2570}, {"loss": 1.4613, "grad_norm": 0.6440825462341309, "learning_rate": 0.0002, "epoch": 3.5173824130879345, "step": 2580}, {"loss": 1.3808, "grad_norm": 0.7074111700057983, "learning_rate": 0.0002, "epoch": 3.531015678254942, "step": 2590}, {"loss": 1.4901, "grad_norm": 0.7007562518119812, "learning_rate": 0.0002, "epoch": 3.5446489434219495, "step": 2600}, {"loss": 1.4511, "grad_norm": 0.6045376658439636, "learning_rate": 0.0002, "epoch": 3.558282208588957, "step": 2610}, {"loss": 1.4596, "grad_norm": 0.9149952530860901, "learning_rate": 0.0002, "epoch": 3.5719154737559644, "step": 2620}, {"loss": 1.4355, "grad_norm": 0.6490362882614136, "learning_rate": 0.0002, "epoch": 3.585548738922972, "step": 2630}, {"loss": 1.4107, "grad_norm": 0.6552226543426514, "learning_rate": 0.0002, "epoch": 3.59918200408998, "step": 2640}, {"loss": 1.433, "grad_norm": 0.6541850566864014, "learning_rate": 0.0002, "epoch": 3.612815269256987, "step": 2650}, {"loss": 1.4279, "grad_norm": 0.6500770449638367, "learning_rate": 0.0002, "epoch": 3.6264485344239947, "step": 2660}, {"loss": 1.3929, "grad_norm": 0.6345893740653992, "learning_rate": 0.0002, "epoch": 3.640081799591002, "step": 2670}, {"loss": 1.3634, "grad_norm": 0.6382275223731995, "learning_rate": 0.0002, "epoch": 3.6537150647580097, "step": 2680}, {"loss": 1.4478, "grad_norm": 0.6738566160202026, "learning_rate": 0.0002, "epoch": 3.667348329925017, "step": 2690}, {"loss": 1.4642, "grad_norm": 0.7446315288543701, "learning_rate": 0.0002, "epoch": 3.6809815950920246, "step": 2700}, {"loss": 1.4342, "grad_norm": 0.6717571020126343, "learning_rate": 0.0002, "epoch": 3.694614860259032, "step": 2710}, {"loss": 1.4285, "grad_norm": 0.667259693145752, "learning_rate": 0.0002, "epoch": 3.7082481254260395, "step": 2720}, {"loss": 1.5389, "grad_norm": 0.6808622479438782, "learning_rate": 0.0002, "epoch": 3.721881390593047, "step": 2730}, {"loss": 1.4297, "grad_norm": 0.7254287004470825, "learning_rate": 0.0002, "epoch": 3.7355146557600545, "step": 2740}, {"loss": 1.4176, "grad_norm": 0.6864007711410522, "learning_rate": 0.0002, "epoch": 3.749147920927062, "step": 2750}, {"loss": 1.4811, "grad_norm": 0.7041361331939697, "learning_rate": 0.0002, "epoch": 3.7627811860940694, "step": 2760}, {"loss": 1.4284, "grad_norm": 0.6559903025627136, "learning_rate": 0.0002, "epoch": 3.776414451261077, "step": 2770}, {"loss": 1.4608, "grad_norm": 0.6602269411087036, "learning_rate": 0.0002, "epoch": 3.7900477164280844, "step": 2780}, {"loss": 1.4588, "grad_norm": 0.692611813545227, "learning_rate": 0.0002, "epoch": 3.8036809815950923, "step": 2790}, {"loss": 1.4065, "grad_norm": 0.7051475644111633, "learning_rate": 0.0002, "epoch": 3.8173142467620993, "step": 2800}, {"loss": 1.4083, "grad_norm": 0.6685371398925781, "learning_rate": 0.0002, "epoch": 3.830947511929107, "step": 2810}, {"loss": 1.5227, "grad_norm": 0.6706477403640747, "learning_rate": 0.0002, "epoch": 3.8445807770961147, "step": 2820}, {"loss": 1.4076, "grad_norm": 0.6671637296676636, "learning_rate": 0.0002, "epoch": 3.858214042263122, "step": 2830}, {"loss": 1.4736, "grad_norm": 0.694092333316803, "learning_rate": 0.0002, "epoch": 3.8718473074301296, "step": 2840}, {"loss": 1.4161, "grad_norm": 0.7349600195884705, "learning_rate": 0.0002, "epoch": 3.885480572597137, "step": 2850}, {"loss": 1.4617, "grad_norm": 0.6647971868515015, "learning_rate": 0.0002, "epoch": 3.8991138377641446, "step": 2860}, {"loss": 1.5046, "grad_norm": 0.806656539440155, "learning_rate": 0.0002, "epoch": 3.912747102931152, "step": 2870}, {"loss": 1.428, "grad_norm": 0.6008772850036621, "learning_rate": 0.0002, "epoch": 3.9263803680981595, "step": 2880}, {"loss": 1.4116, "grad_norm": 0.659227728843689, "learning_rate": 0.0002, "epoch": 3.940013633265167, "step": 2890}, {"loss": 1.4136, "grad_norm": 0.6357656717300415, "learning_rate": 0.0002, "epoch": 3.9536468984321744, "step": 2900}, {"loss": 1.4655, "grad_norm": 0.6541687846183777, "learning_rate": 0.0002, "epoch": 3.967280163599182, "step": 2910}, {"loss": 1.4854, "grad_norm": 0.6090909838676453, "learning_rate": 0.0002, "epoch": 3.9809134287661894, "step": 2920}, {"loss": 1.4615, "grad_norm": 0.7198411822319031, "learning_rate": 0.0002, "epoch": 3.994546693933197, "step": 2930}]} +{"epoch": 4.999318336741649, "step": 3667, "epoch_duration": 1095.6056084632874, "total_accumulated_duration": 5477.18413734436, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0982, "grad_norm": 0.7714291214942932, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5206, "grad_norm": 0.5473978519439697, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3079, "grad_norm": 0.5452795624732971, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 2.0019, "grad_norm": 0.5098028779029846, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9333, "grad_norm": 0.48062971234321594, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9355, "grad_norm": 0.4505695104598999, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9312, "grad_norm": 0.41609591245651245, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8656, "grad_norm": 0.4323892593383789, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9294, "grad_norm": 0.4670293629169464, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7946, "grad_norm": 0.40623316168785095, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8565, "grad_norm": 0.3620383143424988, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9238, "grad_norm": 0.332218736410141, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.93, "grad_norm": 0.4004521667957306, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7549, "grad_norm": 0.3698360323905945, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8771, "grad_norm": 0.3847949504852295, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8316, "grad_norm": 0.36843451857566833, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.838, "grad_norm": 0.37301021814346313, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8909, "grad_norm": 0.3718886971473694, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8454, "grad_norm": 0.3088490962982178, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9254, "grad_norm": 0.3611852526664734, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7844, "grad_norm": 0.36093324422836304, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3250400722026825, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8729, "grad_norm": 0.3566756248474121, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9259, "grad_norm": 0.32872408628463745, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9033, "grad_norm": 0.3983881175518036, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8588, "grad_norm": 0.3571510910987854, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8539, "grad_norm": 0.3036131262779236, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8572, "grad_norm": 0.36512863636016846, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8022, "grad_norm": 0.3429736793041229, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8754, "grad_norm": 0.3055964708328247, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8384, "grad_norm": 0.33801034092903137, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7933, "grad_norm": 0.348783016204834, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3057514727115631, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8766, "grad_norm": 0.3849763572216034, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8073, "grad_norm": 0.30080053210258484, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8548, "grad_norm": 0.3595106303691864, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8232, "grad_norm": 0.31099820137023926, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7029, "grad_norm": 0.3157978355884552, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8265, "grad_norm": 0.27960965037345886, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7414, "grad_norm": 0.3102385103702545, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7461, "grad_norm": 0.32828861474990845, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8165, "grad_norm": 0.29560017585754395, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9455, "grad_norm": 0.33316895365715027, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8241, "grad_norm": 0.30420982837677, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7565, "grad_norm": 0.32619214057922363, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7945, "grad_norm": 0.3603750765323639, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7773, "grad_norm": 0.30834096670150757, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8058, "grad_norm": 0.28756365180015564, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.744, "grad_norm": 0.2878406345844269, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8581, "grad_norm": 0.31329697370529175, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7886, "grad_norm": 0.3405822515487671, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.778, "grad_norm": 0.305560827255249, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7592, "grad_norm": 0.2973416745662689, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8223, "grad_norm": 0.327303946018219, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8591, "grad_norm": 0.62595534324646, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3129784166812897, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8035, "grad_norm": 0.32496583461761475, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7787, "grad_norm": 0.3098868131637573, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7196, "grad_norm": 0.30726853013038635, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7898, "grad_norm": 0.2964220643043518, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8114, "grad_norm": 0.32352274656295776, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.811, "grad_norm": 0.2938912510871887, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7727, "grad_norm": 0.295559823513031, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9, "grad_norm": 0.34102028608322144, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8006, "grad_norm": 0.29676181077957153, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8099, "grad_norm": 0.3108902871608734, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7955, "grad_norm": 0.2690821588039398, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7881, "grad_norm": 0.32752540707588196, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7661, "grad_norm": 0.8029476404190063, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7733, "grad_norm": 0.30534422397613525, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7614, "grad_norm": 0.2899954319000244, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7845, "grad_norm": 0.28814372420310974, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8865, "grad_norm": 0.3061596751213074, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}, {"eval_loss": 1.8171186447143555, "eval_runtime": 53.6047, "eval_samples_per_second": 9.458, "eval_steps_per_second": 1.194, "epoch": 0.9993183367416496, "step": 733}, {"loss": 1.6202, "grad_norm": 0.3140897750854492, "learning_rate": 0.0002, "epoch": 1.008861622358555, "step": 740}, {"loss": 1.8409, "grad_norm": 0.3346109390258789, "learning_rate": 0.0002, "epoch": 1.0224948875255624, "step": 750}, {"loss": 1.6777, "grad_norm": 0.3582976758480072, "learning_rate": 0.0002, "epoch": 1.0361281526925699, "step": 760}, {"loss": 1.7306, "grad_norm": 0.30408260226249695, "learning_rate": 0.0002, "epoch": 1.0497614178595773, "step": 770}, {"loss": 1.6967, "grad_norm": 0.323585569858551, "learning_rate": 0.0002, "epoch": 1.0633946830265848, "step": 780}, {"loss": 1.768, "grad_norm": 0.3474137783050537, "learning_rate": 0.0002, "epoch": 1.0770279481935923, "step": 790}, {"loss": 1.6895, "grad_norm": 0.35721147060394287, "learning_rate": 0.0002, "epoch": 1.0906612133606, "step": 800}, {"loss": 1.718, "grad_norm": 0.35366931557655334, "learning_rate": 0.0002, "epoch": 1.1042944785276074, "step": 810}, {"loss": 1.6797, "grad_norm": 0.3250770568847656, "learning_rate": 0.0002, "epoch": 1.117927743694615, "step": 820}, {"loss": 1.6383, "grad_norm": 0.3293766379356384, "learning_rate": 0.0002, "epoch": 1.1315610088616224, "step": 830}, {"loss": 1.7353, "grad_norm": 0.3380851745605469, "learning_rate": 0.0002, "epoch": 1.1451942740286298, "step": 840}, {"loss": 1.8236, "grad_norm": 0.32584455609321594, "learning_rate": 0.0002, "epoch": 1.1588275391956373, "step": 850}, {"loss": 1.6681, "grad_norm": 0.45700767636299133, "learning_rate": 0.0002, "epoch": 1.1724608043626448, "step": 860}, {"loss": 1.7494, "grad_norm": 0.30944544076919556, "learning_rate": 0.0002, "epoch": 1.1860940695296525, "step": 870}, {"loss": 1.7426, "grad_norm": 0.3268151581287384, "learning_rate": 0.0002, "epoch": 1.19972733469666, "step": 880}, {"loss": 1.7413, "grad_norm": 0.39972540736198425, "learning_rate": 0.0002, "epoch": 1.2133605998636674, "step": 890}, {"loss": 1.7481, "grad_norm": 0.7890929579734802, "learning_rate": 0.0002, "epoch": 1.2269938650306749, "step": 900}, {"loss": 1.7608, "grad_norm": 0.3439182639122009, "learning_rate": 0.0002, "epoch": 1.2406271301976823, "step": 910}, {"loss": 1.7617, "grad_norm": 0.3986225128173828, "learning_rate": 0.0002, "epoch": 1.2542603953646898, "step": 920}, {"loss": 1.6843, "grad_norm": 0.3514605164527893, "learning_rate": 0.0002, "epoch": 1.2678936605316973, "step": 930}, {"loss": 1.6987, "grad_norm": 0.3682589530944824, "learning_rate": 0.0002, "epoch": 1.2815269256987047, "step": 940}, {"loss": 1.6988, "grad_norm": 0.3618335723876953, "learning_rate": 0.0002, "epoch": 1.2951601908657122, "step": 950}, {"loss": 1.7436, "grad_norm": 0.345700740814209, "learning_rate": 0.0002, "epoch": 1.30879345603272, "step": 960}, {"loss": 1.7336, "grad_norm": 0.3514927923679352, "learning_rate": 0.0002, "epoch": 1.3224267211997274, "step": 970}, {"loss": 1.7704, "grad_norm": 0.365647554397583, "learning_rate": 0.0002, "epoch": 1.3360599863667348, "step": 980}, {"loss": 1.7104, "grad_norm": 0.3407285809516907, "learning_rate": 0.0002, "epoch": 1.3496932515337423, "step": 990}, {"loss": 1.7132, "grad_norm": 0.3785437345504761, "learning_rate": 0.0002, "epoch": 1.3633265167007498, "step": 1000}, {"loss": 1.766, "grad_norm": 0.34746724367141724, "learning_rate": 0.0002, "epoch": 1.3769597818677572, "step": 1010}, {"loss": 1.7252, "grad_norm": 0.362444132566452, "learning_rate": 0.0002, "epoch": 1.390593047034765, "step": 1020}, {"loss": 1.7132, "grad_norm": 0.4424704611301422, "learning_rate": 0.0002, "epoch": 1.4042263122017724, "step": 1030}, {"loss": 1.726, "grad_norm": 0.38722458481788635, "learning_rate": 0.0002, "epoch": 1.4178595773687799, "step": 1040}, {"loss": 1.7955, "grad_norm": 0.36089080572128296, "learning_rate": 0.0002, "epoch": 1.4314928425357873, "step": 1050}, {"loss": 1.6924, "grad_norm": 0.33817124366760254, "learning_rate": 0.0002, "epoch": 1.4451261077027948, "step": 1060}, {"loss": 1.7165, "grad_norm": 0.34334081411361694, "learning_rate": 0.0002, "epoch": 1.4587593728698023, "step": 1070}, {"loss": 1.6999, "grad_norm": 0.3776826858520508, "learning_rate": 0.0002, "epoch": 1.4723926380368098, "step": 1080}, {"loss": 1.7605, "grad_norm": 0.4169026017189026, "learning_rate": 0.0002, "epoch": 1.4860259032038172, "step": 1090}, {"loss": 1.7502, "grad_norm": 0.34898945689201355, "learning_rate": 0.0002, "epoch": 1.4996591683708247, "step": 1100}, {"loss": 1.635, "grad_norm": 0.34223780035972595, "learning_rate": 0.0002, "epoch": 1.5132924335378322, "step": 1110}, {"loss": 1.7248, "grad_norm": 0.3686901032924652, "learning_rate": 0.0002, "epoch": 1.5269256987048399, "step": 1120}, {"loss": 1.7525, "grad_norm": 0.35054415464401245, "learning_rate": 0.0002, "epoch": 1.5405589638718473, "step": 1130}, {"loss": 1.7776, "grad_norm": 0.39496365189552307, "learning_rate": 0.0002, "epoch": 1.5541922290388548, "step": 1140}, {"loss": 1.6574, "grad_norm": 0.35451626777648926, "learning_rate": 0.0002, "epoch": 1.5678254942058623, "step": 1150}, {"loss": 1.7257, "grad_norm": 0.3848083019256592, "learning_rate": 0.0002, "epoch": 1.58145875937287, "step": 1160}, {"loss": 1.7272, "grad_norm": 0.3760537803173065, "learning_rate": 0.0002, "epoch": 1.5950920245398774, "step": 1170}, {"loss": 1.7441, "grad_norm": 0.38981738686561584, "learning_rate": 0.0002, "epoch": 1.6087252897068849, "step": 1180}, {"loss": 1.6951, "grad_norm": 0.36830949783325195, "learning_rate": 0.0002, "epoch": 1.6223585548738924, "step": 1190}, {"loss": 1.6925, "grad_norm": 0.3405892848968506, "learning_rate": 0.0002, "epoch": 1.6359918200408998, "step": 1200}, {"loss": 1.7473, "grad_norm": 0.39027872681617737, "learning_rate": 0.0002, "epoch": 1.6496250852079073, "step": 1210}, {"loss": 1.6792, "grad_norm": 0.3342694044113159, "learning_rate": 0.0002, "epoch": 1.6632583503749148, "step": 1220}, {"loss": 1.7196, "grad_norm": 0.3600076735019684, "learning_rate": 0.0002, "epoch": 1.6768916155419222, "step": 1230}, {"loss": 1.7021, "grad_norm": 0.3625542223453522, "learning_rate": 0.0002, "epoch": 1.6905248807089297, "step": 1240}, {"loss": 1.6772, "grad_norm": 0.32170894742012024, "learning_rate": 0.0002, "epoch": 1.7041581458759372, "step": 1250}, {"loss": 1.7152, "grad_norm": 0.3544139862060547, "learning_rate": 0.0002, "epoch": 1.7177914110429446, "step": 1260}, {"loss": 1.7138, "grad_norm": 0.35113027691841125, "learning_rate": 0.0002, "epoch": 1.7314246762099523, "step": 1270}, {"loss": 1.7095, "grad_norm": 0.3499974310398102, "learning_rate": 0.0002, "epoch": 1.7450579413769598, "step": 1280}, {"loss": 1.7749, "grad_norm": 0.3285157382488251, "learning_rate": 0.0002, "epoch": 1.7586912065439673, "step": 1290}, {"loss": 1.6767, "grad_norm": 0.3701961636543274, "learning_rate": 0.0002, "epoch": 1.7723244717109747, "step": 1300}, {"loss": 1.6282, "grad_norm": 0.3301318287849426, "learning_rate": 0.0002, "epoch": 1.7859577368779824, "step": 1310}, {"loss": 1.7097, "grad_norm": 0.37801554799079895, "learning_rate": 0.0002, "epoch": 1.79959100204499, "step": 1320}, {"loss": 1.7437, "grad_norm": 0.3726748526096344, "learning_rate": 0.0002, "epoch": 1.8132242672119974, "step": 1330}, {"loss": 1.7959, "grad_norm": 0.4059790074825287, "learning_rate": 0.0002, "epoch": 1.8268575323790048, "step": 1340}, {"loss": 1.7739, "grad_norm": 0.35712096095085144, "learning_rate": 0.0002, "epoch": 1.8404907975460123, "step": 1350}, {"loss": 1.6375, "grad_norm": 0.35995328426361084, "learning_rate": 0.0002, "epoch": 1.8541240627130198, "step": 1360}, {"loss": 1.7332, "grad_norm": 0.3679947257041931, "learning_rate": 0.0002, "epoch": 1.8677573278800272, "step": 1370}, {"loss": 1.7587, "grad_norm": 0.39645957946777344, "learning_rate": 0.0002, "epoch": 1.8813905930470347, "step": 1380}, {"loss": 1.6985, "grad_norm": 0.35288700461387634, "learning_rate": 0.0002, "epoch": 1.8950238582140422, "step": 1390}, {"loss": 1.6582, "grad_norm": 0.32579198479652405, "learning_rate": 0.0002, "epoch": 1.9086571233810496, "step": 1400}, {"loss": 1.6948, "grad_norm": 0.3856561779975891, "learning_rate": 0.0002, "epoch": 1.9222903885480571, "step": 1410}, {"loss": 1.668, "grad_norm": 0.39019331336021423, "learning_rate": 0.0002, "epoch": 1.9359236537150648, "step": 1420}, {"loss": 1.7774, "grad_norm": 0.38006502389907837, "learning_rate": 0.0002, "epoch": 1.9495569188820723, "step": 1430}, {"loss": 1.8323, "grad_norm": 0.38100454211235046, "learning_rate": 0.0002, "epoch": 1.9631901840490797, "step": 1440}, {"loss": 1.7298, "grad_norm": 0.3405798673629761, "learning_rate": 0.0002, "epoch": 1.9768234492160872, "step": 1450}, {"loss": 1.7045, "grad_norm": 0.36582913994789124, "learning_rate": 0.0002, "epoch": 1.990456714383095, "step": 1460}, {"eval_loss": 1.8178424835205078, "eval_runtime": 53.6524, "eval_samples_per_second": 9.45, "eval_steps_per_second": 1.193, "epoch": 2.0, "step": 1467}, {"loss": 1.6363, "grad_norm": 0.3626647889614105, "learning_rate": 0.0002, "epoch": 2.0040899795501024, "step": 1470}, {"loss": 1.5354, "grad_norm": 0.40171775221824646, "learning_rate": 0.0002, "epoch": 2.01772324471711, "step": 1480}, {"loss": 1.5566, "grad_norm": 0.5805319547653198, "learning_rate": 0.0002, "epoch": 2.0313565098841173, "step": 1490}, {"loss": 1.546, "grad_norm": 0.41954153776168823, "learning_rate": 0.0002, "epoch": 2.044989775051125, "step": 1500}, {"loss": 1.6158, "grad_norm": 0.47190725803375244, "learning_rate": 0.0002, "epoch": 2.0586230402181322, "step": 1510}, {"loss": 1.5841, "grad_norm": 0.4388456344604492, "learning_rate": 0.0002, "epoch": 2.0722563053851397, "step": 1520}, {"loss": 1.5835, "grad_norm": 2.2171926498413086, "learning_rate": 0.0002, "epoch": 2.085889570552147, "step": 1530}, {"loss": 1.6137, "grad_norm": 0.4314221143722534, "learning_rate": 0.0002, "epoch": 2.0995228357191547, "step": 1540}, {"loss": 1.5511, "grad_norm": 0.4154265522956848, "learning_rate": 0.0002, "epoch": 2.113156100886162, "step": 1550}, {"loss": 1.6323, "grad_norm": 0.5025539994239807, "learning_rate": 0.0002, "epoch": 2.1267893660531696, "step": 1560}, {"loss": 1.5903, "grad_norm": 0.5410493016242981, "learning_rate": 0.0002, "epoch": 2.140422631220177, "step": 1570}, {"loss": 1.507, "grad_norm": 0.4478487968444824, "learning_rate": 0.0002, "epoch": 2.1540558963871845, "step": 1580}, {"loss": 1.5536, "grad_norm": 0.4703652560710907, "learning_rate": 0.0002, "epoch": 2.1676891615541924, "step": 1590}, {"loss": 1.5991, "grad_norm": 0.4555390179157257, "learning_rate": 0.0002, "epoch": 2.1813224267212, "step": 1600}, {"loss": 1.6117, "grad_norm": 0.4877263903617859, "learning_rate": 0.0002, "epoch": 2.1949556918882074, "step": 1610}, {"loss": 1.5928, "grad_norm": 0.48708245158195496, "learning_rate": 0.0002, "epoch": 2.208588957055215, "step": 1620}, {"loss": 1.6106, "grad_norm": 0.47523951530456543, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1630}, {"loss": 1.6013, "grad_norm": 0.4889199733734131, "learning_rate": 0.0002, "epoch": 2.23585548738923, "step": 1640}, {"loss": 1.6633, "grad_norm": 0.4585252106189728, "learning_rate": 0.0002, "epoch": 2.2494887525562373, "step": 1650}, {"loss": 1.6075, "grad_norm": 0.4764868915081024, "learning_rate": 0.0002, "epoch": 2.2631220177232447, "step": 1660}, {"loss": 1.6427, "grad_norm": 0.5028976202011108, "learning_rate": 0.0002, "epoch": 2.276755282890252, "step": 1670}, {"loss": 1.6258, "grad_norm": 0.46131211519241333, "learning_rate": 0.0002, "epoch": 2.2903885480572597, "step": 1680}, {"loss": 1.654, "grad_norm": 0.5422874689102173, "learning_rate": 0.0002, "epoch": 2.304021813224267, "step": 1690}, {"loss": 1.6331, "grad_norm": 0.47615355253219604, "learning_rate": 0.0002, "epoch": 2.3176550783912746, "step": 1700}, {"loss": 1.642, "grad_norm": 0.48005548119544983, "learning_rate": 0.0002, "epoch": 2.331288343558282, "step": 1710}, {"loss": 1.581, "grad_norm": 0.4387182295322418, "learning_rate": 0.0002, "epoch": 2.3449216087252895, "step": 1720}, {"loss": 1.5612, "grad_norm": 0.4487272799015045, "learning_rate": 0.0002, "epoch": 2.358554873892297, "step": 1730}, {"loss": 1.5089, "grad_norm": 0.5046455264091492, "learning_rate": 0.0002, "epoch": 2.372188139059305, "step": 1740}, {"loss": 1.5769, "grad_norm": 0.4653521180152893, "learning_rate": 0.0002, "epoch": 2.3858214042263124, "step": 1750}, {"loss": 1.6201, "grad_norm": 0.4737723469734192, "learning_rate": 0.0002, "epoch": 2.39945466939332, "step": 1760}, {"loss": 1.5933, "grad_norm": 0.4501931071281433, "learning_rate": 0.0002, "epoch": 2.4130879345603273, "step": 1770}, {"loss": 1.6321, "grad_norm": 0.4772880971431732, "learning_rate": 0.0002, "epoch": 2.426721199727335, "step": 1780}, {"loss": 1.5454, "grad_norm": 0.4544616937637329, "learning_rate": 0.0002, "epoch": 2.4403544648943423, "step": 1790}, {"loss": 1.5501, "grad_norm": 0.488313227891922, "learning_rate": 0.0002, "epoch": 2.4539877300613497, "step": 1800}, {"loss": 1.5791, "grad_norm": 0.5057830214500427, "learning_rate": 0.0002, "epoch": 2.467620995228357, "step": 1810}, {"loss": 1.5645, "grad_norm": 0.5049484968185425, "learning_rate": 0.0002, "epoch": 2.4812542603953647, "step": 1820}, {"loss": 1.6268, "grad_norm": 0.44966644048690796, "learning_rate": 0.0002, "epoch": 2.494887525562372, "step": 1830}, {"loss": 1.5941, "grad_norm": 0.5072630643844604, "learning_rate": 0.0002, "epoch": 2.5085207907293796, "step": 1840}, {"loss": 1.5251, "grad_norm": 0.43989792466163635, "learning_rate": 0.0002, "epoch": 2.522154055896387, "step": 1850}, {"loss": 1.563, "grad_norm": 1.3504403829574585, "learning_rate": 0.0002, "epoch": 2.5357873210633946, "step": 1860}, {"loss": 1.5681, "grad_norm": 0.46545976400375366, "learning_rate": 0.0002, "epoch": 2.549420586230402, "step": 1870}, {"loss": 1.6368, "grad_norm": 0.4678342044353485, "learning_rate": 0.0002, "epoch": 2.5630538513974095, "step": 1880}, {"loss": 1.5814, "grad_norm": 0.529755711555481, "learning_rate": 0.0002, "epoch": 2.5766871165644174, "step": 1890}, {"loss": 1.5861, "grad_norm": 0.5000199675559998, "learning_rate": 0.0002, "epoch": 2.5903203817314244, "step": 1900}, {"loss": 1.6346, "grad_norm": 0.5649300217628479, "learning_rate": 0.0002, "epoch": 2.6039536468984323, "step": 1910}, {"loss": 1.6317, "grad_norm": 0.7920585870742798, "learning_rate": 0.0002, "epoch": 2.61758691206544, "step": 1920}, {"loss": 1.643, "grad_norm": 0.4960342049598694, "learning_rate": 0.0002, "epoch": 2.6312201772324473, "step": 1930}, {"loss": 1.6099, "grad_norm": 0.5324710011482239, "learning_rate": 0.0002, "epoch": 2.6448534423994547, "step": 1940}, {"loss": 1.5874, "grad_norm": 0.606343150138855, "learning_rate": 0.0002, "epoch": 2.658486707566462, "step": 1950}, {"loss": 1.5728, "grad_norm": 0.53038489818573, "learning_rate": 0.0002, "epoch": 2.6721199727334697, "step": 1960}, {"loss": 1.5583, "grad_norm": 0.4579465091228485, "learning_rate": 0.0002, "epoch": 2.685753237900477, "step": 1970}, {"loss": 1.6093, "grad_norm": 0.4541707932949066, "learning_rate": 0.0002, "epoch": 2.6993865030674846, "step": 1980}, {"loss": 1.5316, "grad_norm": 0.5009395480155945, "learning_rate": 0.0002, "epoch": 2.713019768234492, "step": 1990}, {"loss": 1.6724, "grad_norm": 0.4723006784915924, "learning_rate": 0.0002, "epoch": 2.7266530334014996, "step": 2000}, {"loss": 1.638, "grad_norm": 0.5086126923561096, "learning_rate": 0.0002, "epoch": 2.740286298568507, "step": 2010}, {"loss": 1.6223, "grad_norm": 0.47242608666419983, "learning_rate": 0.0002, "epoch": 2.7539195637355145, "step": 2020}, {"loss": 1.6242, "grad_norm": 0.44922566413879395, "learning_rate": 0.0002, "epoch": 2.767552828902522, "step": 2030}, {"loss": 1.6837, "grad_norm": 0.420259565114975, "learning_rate": 0.0002, "epoch": 2.78118609406953, "step": 2040}, {"loss": 1.5612, "grad_norm": 0.4762881100177765, "learning_rate": 0.0002, "epoch": 2.794819359236537, "step": 2050}, {"loss": 1.5506, "grad_norm": 0.5228786468505859, "learning_rate": 0.0002, "epoch": 2.808452624403545, "step": 2060}, {"loss": 1.6347, "grad_norm": 0.4796035587787628, "learning_rate": 0.0002, "epoch": 2.8220858895705523, "step": 2070}, {"loss": 1.6843, "grad_norm": 0.5034735202789307, "learning_rate": 0.0002, "epoch": 2.8357191547375598, "step": 2080}, {"loss": 1.6455, "grad_norm": 0.48005399107933044, "learning_rate": 0.0002, "epoch": 2.8493524199045672, "step": 2090}, {"loss": 1.6287, "grad_norm": 0.578820526599884, "learning_rate": 0.0002, "epoch": 2.8629856850715747, "step": 2100}, {"loss": 1.6021, "grad_norm": 0.48982638120651245, "learning_rate": 0.0002, "epoch": 2.876618950238582, "step": 2110}, {"loss": 1.5769, "grad_norm": 0.5157325863838196, "learning_rate": 0.0002, "epoch": 2.8902522154055896, "step": 2120}, {"loss": 1.6089, "grad_norm": 0.49149683117866516, "learning_rate": 0.0002, "epoch": 2.903885480572597, "step": 2130}, {"loss": 1.5881, "grad_norm": 0.48584499955177307, "learning_rate": 0.0002, "epoch": 2.9175187457396046, "step": 2140}, {"loss": 1.5833, "grad_norm": 0.5199017524719238, "learning_rate": 0.0002, "epoch": 2.931152010906612, "step": 2150}, {"loss": 1.7344, "grad_norm": 0.5788236856460571, "learning_rate": 0.0002, "epoch": 2.9447852760736195, "step": 2160}, {"loss": 1.6103, "grad_norm": 0.48664185404777527, "learning_rate": 0.0002, "epoch": 2.958418541240627, "step": 2170}, {"loss": 1.5765, "grad_norm": 0.5026682615280151, "learning_rate": 0.0002, "epoch": 2.9720518064076344, "step": 2180}, {"loss": 1.6626, "grad_norm": 0.49317044019699097, "learning_rate": 0.0002, "epoch": 2.9856850715746424, "step": 2190}, {"loss": 1.5871, "grad_norm": 0.5729128122329712, "learning_rate": 0.0002, "epoch": 2.9993183367416494, "step": 2200}, {"eval_loss": 1.8527295589447021, "eval_runtime": 53.6403, "eval_samples_per_second": 9.452, "eval_steps_per_second": 1.193, "epoch": 2.9993183367416494, "step": 2200}, {"loss": 1.4719, "grad_norm": 0.5530241131782532, "learning_rate": 0.0002, "epoch": 3.0129516019086573, "step": 2210}, {"loss": 1.4088, "grad_norm": 0.6642216444015503, "learning_rate": 0.0002, "epoch": 3.0265848670756648, "step": 2220}, {"loss": 1.4382, "grad_norm": 0.61470627784729, "learning_rate": 0.0002, "epoch": 3.0402181322426722, "step": 2230}, {"loss": 1.4634, "grad_norm": 0.8559566140174866, "learning_rate": 0.0002, "epoch": 3.0538513974096797, "step": 2240}, {"loss": 1.3854, "grad_norm": 0.7015801668167114, "learning_rate": 0.0002, "epoch": 3.067484662576687, "step": 2250}, {"loss": 1.4981, "grad_norm": 0.7226442694664001, "learning_rate": 0.0002, "epoch": 3.0811179277436946, "step": 2260}, {"loss": 1.4143, "grad_norm": 0.7560588717460632, "learning_rate": 0.0002, "epoch": 3.094751192910702, "step": 2270}, {"loss": 1.4395, "grad_norm": 0.6216568946838379, "learning_rate": 0.0002, "epoch": 3.1083844580777096, "step": 2280}, {"loss": 1.3842, "grad_norm": 0.6768500804901123, "learning_rate": 0.0002, "epoch": 3.122017723244717, "step": 2290}, {"loss": 1.4672, "grad_norm": 0.7028762102127075, "learning_rate": 0.0002, "epoch": 3.1356509884117245, "step": 2300}, {"loss": 1.3826, "grad_norm": 0.6329697966575623, "learning_rate": 0.0002, "epoch": 3.149284253578732, "step": 2310}, {"loss": 1.442, "grad_norm": 0.6328264474868774, "learning_rate": 0.0002, "epoch": 3.1629175187457395, "step": 2320}, {"loss": 1.3762, "grad_norm": 0.7573632001876831, "learning_rate": 0.0002, "epoch": 3.176550783912747, "step": 2330}, {"loss": 1.3553, "grad_norm": 0.595740795135498, "learning_rate": 0.0002, "epoch": 3.190184049079755, "step": 2340}, {"loss": 1.3953, "grad_norm": 0.7111806869506836, "learning_rate": 0.0002, "epoch": 3.2038173142467623, "step": 2350}, {"loss": 1.3797, "grad_norm": 0.6328730583190918, "learning_rate": 0.0002, "epoch": 3.2174505794137698, "step": 2360}, {"loss": 1.3855, "grad_norm": 0.5860254168510437, "learning_rate": 0.0002, "epoch": 3.2310838445807772, "step": 2370}, {"loss": 1.4267, "grad_norm": 0.7387157082557678, "learning_rate": 0.0002, "epoch": 3.2447171097477847, "step": 2380}, {"loss": 1.4837, "grad_norm": 0.6897673606872559, "learning_rate": 0.0002, "epoch": 3.258350374914792, "step": 2390}, {"loss": 1.4372, "grad_norm": 0.7157699465751648, "learning_rate": 0.0002, "epoch": 3.2719836400817996, "step": 2400}, {"loss": 1.4432, "grad_norm": 0.6422511339187622, "learning_rate": 0.0002, "epoch": 3.285616905248807, "step": 2410}, {"loss": 1.4828, "grad_norm": 1.0481886863708496, "learning_rate": 0.0002, "epoch": 3.2992501704158146, "step": 2420}, {"loss": 1.4473, "grad_norm": 0.7050786018371582, "learning_rate": 0.0002, "epoch": 3.312883435582822, "step": 2430}, {"loss": 1.3465, "grad_norm": 0.6090759038925171, "learning_rate": 0.0002, "epoch": 3.3265167007498295, "step": 2440}, {"loss": 1.4619, "grad_norm": 0.6626465320587158, "learning_rate": 0.0002, "epoch": 3.340149965916837, "step": 2450}, {"loss": 1.4512, "grad_norm": 0.6565486788749695, "learning_rate": 0.0002, "epoch": 3.3537832310838445, "step": 2460}, {"loss": 1.588, "grad_norm": 0.6449528932571411, "learning_rate": 0.0002, "epoch": 3.367416496250852, "step": 2470}, {"loss": 1.4773, "grad_norm": 0.7746227383613586, "learning_rate": 0.0002, "epoch": 3.3810497614178594, "step": 2480}, {"loss": 1.417, "grad_norm": 0.7074846029281616, "learning_rate": 0.0002, "epoch": 3.3946830265848673, "step": 2490}, {"loss": 1.4476, "grad_norm": 0.6547690033912659, "learning_rate": 0.0002, "epoch": 3.4083162917518743, "step": 2500}, {"loss": 1.4074, "grad_norm": 0.784721314907074, "learning_rate": 0.0002, "epoch": 3.4219495569188823, "step": 2510}, {"loss": 1.4326, "grad_norm": 0.7270277738571167, "learning_rate": 0.0002, "epoch": 3.4355828220858897, "step": 2520}, {"loss": 1.4354, "grad_norm": 0.67588871717453, "learning_rate": 0.0002, "epoch": 3.449216087252897, "step": 2530}, {"loss": 1.4074, "grad_norm": 0.6768023371696472, "learning_rate": 0.0002, "epoch": 3.4628493524199047, "step": 2540}, {"loss": 1.4863, "grad_norm": 0.7026481628417969, "learning_rate": 0.0002, "epoch": 3.476482617586912, "step": 2550}, {"loss": 1.468, "grad_norm": 0.646075963973999, "learning_rate": 0.0002, "epoch": 3.4901158827539196, "step": 2560}, {"loss": 1.4058, "grad_norm": 0.6288973689079285, "learning_rate": 0.0002, "epoch": 3.503749147920927, "step": 2570}, {"loss": 1.4613, "grad_norm": 0.6440825462341309, "learning_rate": 0.0002, "epoch": 3.5173824130879345, "step": 2580}, {"loss": 1.3808, "grad_norm": 0.7074111700057983, "learning_rate": 0.0002, "epoch": 3.531015678254942, "step": 2590}, {"loss": 1.4901, "grad_norm": 0.7007562518119812, "learning_rate": 0.0002, "epoch": 3.5446489434219495, "step": 2600}, {"loss": 1.4511, "grad_norm": 0.6045376658439636, "learning_rate": 0.0002, "epoch": 3.558282208588957, "step": 2610}, {"loss": 1.4596, "grad_norm": 0.9149952530860901, "learning_rate": 0.0002, "epoch": 3.5719154737559644, "step": 2620}, {"loss": 1.4355, "grad_norm": 0.6490362882614136, "learning_rate": 0.0002, "epoch": 3.585548738922972, "step": 2630}, {"loss": 1.4107, "grad_norm": 0.6552226543426514, "learning_rate": 0.0002, "epoch": 3.59918200408998, "step": 2640}, {"loss": 1.433, "grad_norm": 0.6541850566864014, "learning_rate": 0.0002, "epoch": 3.612815269256987, "step": 2650}, {"loss": 1.4279, "grad_norm": 0.6500770449638367, "learning_rate": 0.0002, "epoch": 3.6264485344239947, "step": 2660}, {"loss": 1.3929, "grad_norm": 0.6345893740653992, "learning_rate": 0.0002, "epoch": 3.640081799591002, "step": 2670}, {"loss": 1.3634, "grad_norm": 0.6382275223731995, "learning_rate": 0.0002, "epoch": 3.6537150647580097, "step": 2680}, {"loss": 1.4478, "grad_norm": 0.6738566160202026, "learning_rate": 0.0002, "epoch": 3.667348329925017, "step": 2690}, {"loss": 1.4642, "grad_norm": 0.7446315288543701, "learning_rate": 0.0002, "epoch": 3.6809815950920246, "step": 2700}, {"loss": 1.4342, "grad_norm": 0.6717571020126343, "learning_rate": 0.0002, "epoch": 3.694614860259032, "step": 2710}, {"loss": 1.4285, "grad_norm": 0.667259693145752, "learning_rate": 0.0002, "epoch": 3.7082481254260395, "step": 2720}, {"loss": 1.5389, "grad_norm": 0.6808622479438782, "learning_rate": 0.0002, "epoch": 3.721881390593047, "step": 2730}, {"loss": 1.4297, "grad_norm": 0.7254287004470825, "learning_rate": 0.0002, "epoch": 3.7355146557600545, "step": 2740}, {"loss": 1.4176, "grad_norm": 0.6864007711410522, "learning_rate": 0.0002, "epoch": 3.749147920927062, "step": 2750}, {"loss": 1.4811, "grad_norm": 0.7041361331939697, "learning_rate": 0.0002, "epoch": 3.7627811860940694, "step": 2760}, {"loss": 1.4284, "grad_norm": 0.6559903025627136, "learning_rate": 0.0002, "epoch": 3.776414451261077, "step": 2770}, {"loss": 1.4608, "grad_norm": 0.6602269411087036, "learning_rate": 0.0002, "epoch": 3.7900477164280844, "step": 2780}, {"loss": 1.4588, "grad_norm": 0.692611813545227, "learning_rate": 0.0002, "epoch": 3.8036809815950923, "step": 2790}, {"loss": 1.4065, "grad_norm": 0.7051475644111633, "learning_rate": 0.0002, "epoch": 3.8173142467620993, "step": 2800}, {"loss": 1.4083, "grad_norm": 0.6685371398925781, "learning_rate": 0.0002, "epoch": 3.830947511929107, "step": 2810}, {"loss": 1.5227, "grad_norm": 0.6706477403640747, "learning_rate": 0.0002, "epoch": 3.8445807770961147, "step": 2820}, {"loss": 1.4076, "grad_norm": 0.6671637296676636, "learning_rate": 0.0002, "epoch": 3.858214042263122, "step": 2830}, {"loss": 1.4736, "grad_norm": 0.694092333316803, "learning_rate": 0.0002, "epoch": 3.8718473074301296, "step": 2840}, {"loss": 1.4161, "grad_norm": 0.7349600195884705, "learning_rate": 0.0002, "epoch": 3.885480572597137, "step": 2850}, {"loss": 1.4617, "grad_norm": 0.6647971868515015, "learning_rate": 0.0002, "epoch": 3.8991138377641446, "step": 2860}, {"loss": 1.5046, "grad_norm": 0.806656539440155, "learning_rate": 0.0002, "epoch": 3.912747102931152, "step": 2870}, {"loss": 1.428, "grad_norm": 0.6008772850036621, "learning_rate": 0.0002, "epoch": 3.9263803680981595, "step": 2880}, {"loss": 1.4116, "grad_norm": 0.659227728843689, "learning_rate": 0.0002, "epoch": 3.940013633265167, "step": 2890}, {"loss": 1.4136, "grad_norm": 0.6357656717300415, "learning_rate": 0.0002, "epoch": 3.9536468984321744, "step": 2900}, {"loss": 1.4655, "grad_norm": 0.6541687846183777, "learning_rate": 0.0002, "epoch": 3.967280163599182, "step": 2910}, {"loss": 1.4854, "grad_norm": 0.6090909838676453, "learning_rate": 0.0002, "epoch": 3.9809134287661894, "step": 2920}, {"loss": 1.4615, "grad_norm": 0.7198411822319031, "learning_rate": 0.0002, "epoch": 3.994546693933197, "step": 2930}, {"eval_loss": 1.9278366565704346, "eval_runtime": 53.6567, "eval_samples_per_second": 9.449, "eval_steps_per_second": 1.193, "epoch": 4.0, "step": 2934}, {"loss": 1.3159, "grad_norm": 0.6498575210571289, "learning_rate": 0.0002, "epoch": 4.008179959100205, "step": 2940}, {"loss": 1.2075, "grad_norm": 0.865602433681488, "learning_rate": 0.0002, "epoch": 4.021813224267212, "step": 2950}, {"loss": 1.1744, "grad_norm": 0.8514999151229858, "learning_rate": 0.0002, "epoch": 4.03544648943422, "step": 2960}, {"loss": 1.1553, "grad_norm": 1.0677322149276733, "learning_rate": 0.0002, "epoch": 4.049079754601227, "step": 2970}, {"loss": 1.1962, "grad_norm": 1.0126488208770752, "learning_rate": 0.0002, "epoch": 4.062713019768235, "step": 2980}, {"loss": 1.1631, "grad_norm": 1.0008870363235474, "learning_rate": 0.0002, "epoch": 4.076346284935242, "step": 2990}, {"loss": 1.2154, "grad_norm": 0.7942054271697998, "learning_rate": 0.0002, "epoch": 4.08997955010225, "step": 3000}, {"loss": 1.214, "grad_norm": 1.0482100248336792, "learning_rate": 0.0002, "epoch": 4.103612815269257, "step": 3010}, {"loss": 1.1999, "grad_norm": 1.0516992807388306, "learning_rate": 0.0002, "epoch": 4.1172460804362645, "step": 3020}, {"loss": 1.2108, "grad_norm": 0.8144322037696838, "learning_rate": 0.0002, "epoch": 4.130879345603272, "step": 3030}, {"loss": 1.1782, "grad_norm": 0.952297568321228, "learning_rate": 0.0002, "epoch": 4.144512610770279, "step": 3040}, {"loss": 1.2814, "grad_norm": 1.007645606994629, "learning_rate": 0.0002, "epoch": 4.158145875937287, "step": 3050}, {"loss": 1.1731, "grad_norm": 1.0480353832244873, "learning_rate": 0.0002, "epoch": 4.171779141104294, "step": 3060}, {"loss": 1.196, "grad_norm": 0.9270663857460022, "learning_rate": 0.0002, "epoch": 4.185412406271302, "step": 3070}, {"loss": 1.2167, "grad_norm": 1.3415262699127197, "learning_rate": 0.0002, "epoch": 4.199045671438309, "step": 3080}, {"loss": 1.2601, "grad_norm": 1.167606234550476, "learning_rate": 0.0002, "epoch": 4.212678936605317, "step": 3090}, {"loss": 1.2605, "grad_norm": 0.9418690800666809, "learning_rate": 0.0002, "epoch": 4.226312201772324, "step": 3100}, {"loss": 1.2184, "grad_norm": 1.0885876417160034, "learning_rate": 0.0002, "epoch": 4.239945466939332, "step": 3110}, {"loss": 1.2594, "grad_norm": 0.9165483713150024, "learning_rate": 0.0002, "epoch": 4.253578732106339, "step": 3120}, {"loss": 1.2933, "grad_norm": 0.9154694080352783, "learning_rate": 0.0002, "epoch": 4.267211997273347, "step": 3130}, {"loss": 1.2584, "grad_norm": 1.100580096244812, "learning_rate": 0.0002, "epoch": 4.280845262440354, "step": 3140}, {"loss": 1.251, "grad_norm": 0.9367576241493225, "learning_rate": 0.0002, "epoch": 4.294478527607362, "step": 3150}, {"loss": 1.2032, "grad_norm": 0.9744015336036682, "learning_rate": 0.0002, "epoch": 4.308111792774369, "step": 3160}, {"loss": 1.2787, "grad_norm": 0.9865175485610962, "learning_rate": 0.0002, "epoch": 4.321745057941377, "step": 3170}, {"loss": 1.2161, "grad_norm": 1.0124907493591309, "learning_rate": 0.0002, "epoch": 4.335378323108385, "step": 3180}, {"loss": 1.2452, "grad_norm": 1.1044819355010986, "learning_rate": 0.0002, "epoch": 4.349011588275392, "step": 3190}, {"loss": 1.2483, "grad_norm": 0.9305577278137207, "learning_rate": 0.0002, "epoch": 4.3626448534424, "step": 3200}, {"loss": 1.2101, "grad_norm": 0.969265341758728, "learning_rate": 0.0002, "epoch": 4.376278118609407, "step": 3210}, {"loss": 1.2355, "grad_norm": 1.0671923160552979, "learning_rate": 0.0002, "epoch": 4.389911383776415, "step": 3220}, {"loss": 1.2259, "grad_norm": 0.9440539479255676, "learning_rate": 0.0002, "epoch": 4.403544648943422, "step": 3230}, {"loss": 1.1706, "grad_norm": 0.9824562668800354, "learning_rate": 0.0002, "epoch": 4.41717791411043, "step": 3240}, {"loss": 1.2234, "grad_norm": 1.0245535373687744, "learning_rate": 0.0002, "epoch": 4.430811179277437, "step": 3250}, {"loss": 1.2713, "grad_norm": 0.9629312753677368, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 3260}, {"loss": 1.2689, "grad_norm": 1.1556470394134521, "learning_rate": 0.0002, "epoch": 4.458077709611452, "step": 3270}, {"loss": 1.2214, "grad_norm": 0.9796679019927979, "learning_rate": 0.0002, "epoch": 4.47171097477846, "step": 3280}, {"loss": 1.2823, "grad_norm": 0.9030535221099854, "learning_rate": 0.0002, "epoch": 4.485344239945467, "step": 3290}, {"loss": 1.2111, "grad_norm": 0.9142820835113525, "learning_rate": 0.0002, "epoch": 4.4989775051124745, "step": 3300}, {"loss": 1.2398, "grad_norm": 0.966867208480835, "learning_rate": 0.0002, "epoch": 4.5126107702794815, "step": 3310}, {"loss": 1.2537, "grad_norm": 1.0127079486846924, "learning_rate": 0.0002, "epoch": 4.5262440354464895, "step": 3320}, {"loss": 1.2059, "grad_norm": 1.055506706237793, "learning_rate": 0.0002, "epoch": 4.539877300613497, "step": 3330}, {"loss": 1.2958, "grad_norm": 0.9831468462944031, "learning_rate": 0.0002, "epoch": 4.553510565780504, "step": 3340}, {"loss": 1.2643, "grad_norm": 0.9304661154747009, "learning_rate": 0.0002, "epoch": 4.567143830947512, "step": 3350}, {"loss": 1.3621, "grad_norm": 0.9369107484817505, "learning_rate": 0.0002, "epoch": 4.580777096114519, "step": 3360}, {"loss": 1.2301, "grad_norm": 1.009506344795227, "learning_rate": 0.0002, "epoch": 4.594410361281527, "step": 3370}, {"loss": 1.2535, "grad_norm": 1.0575741529464722, "learning_rate": 0.0002, "epoch": 4.608043626448534, "step": 3380}, {"loss": 1.1914, "grad_norm": 0.9102860689163208, "learning_rate": 0.0002, "epoch": 4.621676891615542, "step": 3390}, {"loss": 1.3156, "grad_norm": 0.8111315965652466, "learning_rate": 0.0002, "epoch": 4.635310156782549, "step": 3400}, {"loss": 1.3103, "grad_norm": 0.9459649920463562, "learning_rate": 0.0002, "epoch": 4.648943421949557, "step": 3410}, {"loss": 1.3146, "grad_norm": 0.9709545969963074, "learning_rate": 0.0002, "epoch": 4.662576687116564, "step": 3420}, {"loss": 1.2958, "grad_norm": 0.9909247159957886, "learning_rate": 0.0002, "epoch": 4.676209952283572, "step": 3430}, {"loss": 1.3186, "grad_norm": 0.9094610810279846, "learning_rate": 0.0002, "epoch": 4.689843217450579, "step": 3440}, {"loss": 1.3397, "grad_norm": 0.9012220501899719, "learning_rate": 0.0002, "epoch": 4.703476482617587, "step": 3450}, {"loss": 1.2595, "grad_norm": 0.8669242858886719, "learning_rate": 0.0002, "epoch": 4.717109747784594, "step": 3460}, {"loss": 1.2762, "grad_norm": 0.9753699898719788, "learning_rate": 0.0002, "epoch": 4.730743012951602, "step": 3470}, {"loss": 1.2371, "grad_norm": 1.0252684354782104, "learning_rate": 0.0002, "epoch": 4.74437627811861, "step": 3480}, {"loss": 1.2536, "grad_norm": 1.208098292350769, "learning_rate": 0.0002, "epoch": 4.758009543285617, "step": 3490}, {"loss": 1.2256, "grad_norm": 0.8632914423942566, "learning_rate": 0.0002, "epoch": 4.771642808452625, "step": 3500}, {"loss": 1.3062, "grad_norm": 1.0084818601608276, "learning_rate": 0.0002, "epoch": 4.785276073619632, "step": 3510}, {"loss": 1.3004, "grad_norm": 0.9095172882080078, "learning_rate": 0.0002, "epoch": 4.79890933878664, "step": 3520}, {"loss": 1.263, "grad_norm": 0.9740135669708252, "learning_rate": 0.0002, "epoch": 4.812542603953647, "step": 3530}, {"loss": 1.2816, "grad_norm": 0.8862348794937134, "learning_rate": 0.0002, "epoch": 4.826175869120655, "step": 3540}, {"loss": 1.2275, "grad_norm": 1.0761774778366089, "learning_rate": 0.0002, "epoch": 4.839809134287662, "step": 3550}, {"loss": 1.2257, "grad_norm": 1.0134117603302002, "learning_rate": 0.0002, "epoch": 4.85344239945467, "step": 3560}, {"loss": 1.2904, "grad_norm": 0.9262851476669312, "learning_rate": 0.0002, "epoch": 4.867075664621677, "step": 3570}, {"loss": 1.1466, "grad_norm": 0.9518504738807678, "learning_rate": 0.0002, "epoch": 4.8807089297886845, "step": 3580}, {"loss": 1.2741, "grad_norm": 1.10103178024292, "learning_rate": 0.0002, "epoch": 4.894342194955692, "step": 3590}, {"loss": 1.2592, "grad_norm": 1.0133225917816162, "learning_rate": 0.0002, "epoch": 4.9079754601226995, "step": 3600}, {"loss": 1.2856, "grad_norm": 0.9637737274169922, "learning_rate": 0.0002, "epoch": 4.9216087252897065, "step": 3610}, {"loss": 1.2991, "grad_norm": 0.9800633192062378, "learning_rate": 0.0002, "epoch": 4.935241990456714, "step": 3620}, {"loss": 1.2872, "grad_norm": 1.0065973997116089, "learning_rate": 0.0002, "epoch": 4.948875255623722, "step": 3630}, {"loss": 1.2408, "grad_norm": 0.9354690313339233, "learning_rate": 0.0002, "epoch": 4.962508520790729, "step": 3640}, {"loss": 1.291, "grad_norm": 0.9744119048118591, "learning_rate": 0.0002, "epoch": 4.976141785957737, "step": 3650}, {"loss": 1.2513, "grad_norm": 0.9357708096504211, "learning_rate": 0.0002, "epoch": 4.989775051124744, "step": 3660}]} +{"epoch": 6.0, "step": 4401, "epoch_duration": 1095.5983347892761, "total_accumulated_duration": 6572.7824721336365, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0982, "grad_norm": 0.7714291214942932, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5206, "grad_norm": 0.5473978519439697, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3079, "grad_norm": 0.5452795624732971, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 2.0019, "grad_norm": 0.5098028779029846, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9333, "grad_norm": 0.48062971234321594, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9355, "grad_norm": 0.4505695104598999, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9312, "grad_norm": 0.41609591245651245, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8656, "grad_norm": 0.4323892593383789, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9294, "grad_norm": 0.4670293629169464, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7946, "grad_norm": 0.40623316168785095, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8565, "grad_norm": 0.3620383143424988, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9238, "grad_norm": 0.332218736410141, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.93, "grad_norm": 0.4004521667957306, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7549, "grad_norm": 0.3698360323905945, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8771, "grad_norm": 0.3847949504852295, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8316, "grad_norm": 0.36843451857566833, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.838, "grad_norm": 0.37301021814346313, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8909, "grad_norm": 0.3718886971473694, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8454, "grad_norm": 0.3088490962982178, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9254, "grad_norm": 0.3611852526664734, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7844, "grad_norm": 0.36093324422836304, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3250400722026825, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8729, "grad_norm": 0.3566756248474121, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9259, "grad_norm": 0.32872408628463745, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9033, "grad_norm": 0.3983881175518036, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8588, "grad_norm": 0.3571510910987854, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8539, "grad_norm": 0.3036131262779236, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8572, "grad_norm": 0.36512863636016846, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8022, "grad_norm": 0.3429736793041229, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8754, "grad_norm": 0.3055964708328247, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8384, "grad_norm": 0.33801034092903137, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7933, "grad_norm": 0.348783016204834, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3057514727115631, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8766, "grad_norm": 0.3849763572216034, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8073, "grad_norm": 0.30080053210258484, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8548, "grad_norm": 0.3595106303691864, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8232, "grad_norm": 0.31099820137023926, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7029, "grad_norm": 0.3157978355884552, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8265, "grad_norm": 0.27960965037345886, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7414, "grad_norm": 0.3102385103702545, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7461, "grad_norm": 0.32828861474990845, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8165, "grad_norm": 0.29560017585754395, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9455, "grad_norm": 0.33316895365715027, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8241, "grad_norm": 0.30420982837677, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7565, "grad_norm": 0.32619214057922363, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7945, "grad_norm": 0.3603750765323639, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7773, "grad_norm": 0.30834096670150757, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8058, "grad_norm": 0.28756365180015564, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.744, "grad_norm": 0.2878406345844269, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8581, "grad_norm": 0.31329697370529175, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7886, "grad_norm": 0.3405822515487671, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.778, "grad_norm": 0.305560827255249, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7592, "grad_norm": 0.2973416745662689, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8223, "grad_norm": 0.327303946018219, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8591, "grad_norm": 0.62595534324646, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3129784166812897, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8035, "grad_norm": 0.32496583461761475, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7787, "grad_norm": 0.3098868131637573, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7196, "grad_norm": 0.30726853013038635, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7898, "grad_norm": 0.2964220643043518, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8114, "grad_norm": 0.32352274656295776, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.811, "grad_norm": 0.2938912510871887, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7727, "grad_norm": 0.295559823513031, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9, "grad_norm": 0.34102028608322144, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8006, "grad_norm": 0.29676181077957153, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8099, "grad_norm": 0.3108902871608734, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7955, "grad_norm": 0.2690821588039398, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7881, "grad_norm": 0.32752540707588196, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7661, "grad_norm": 0.8029476404190063, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7733, "grad_norm": 0.30534422397613525, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7614, "grad_norm": 0.2899954319000244, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7845, "grad_norm": 0.28814372420310974, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8865, "grad_norm": 0.3061596751213074, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}, {"eval_loss": 1.8171186447143555, "eval_runtime": 53.6047, "eval_samples_per_second": 9.458, "eval_steps_per_second": 1.194, "epoch": 0.9993183367416496, "step": 733}, {"loss": 1.6202, "grad_norm": 0.3140897750854492, "learning_rate": 0.0002, "epoch": 1.008861622358555, "step": 740}, {"loss": 1.8409, "grad_norm": 0.3346109390258789, "learning_rate": 0.0002, "epoch": 1.0224948875255624, "step": 750}, {"loss": 1.6777, "grad_norm": 0.3582976758480072, "learning_rate": 0.0002, "epoch": 1.0361281526925699, "step": 760}, {"loss": 1.7306, "grad_norm": 0.30408260226249695, "learning_rate": 0.0002, "epoch": 1.0497614178595773, "step": 770}, {"loss": 1.6967, "grad_norm": 0.323585569858551, "learning_rate": 0.0002, "epoch": 1.0633946830265848, "step": 780}, {"loss": 1.768, "grad_norm": 0.3474137783050537, "learning_rate": 0.0002, "epoch": 1.0770279481935923, "step": 790}, {"loss": 1.6895, "grad_norm": 0.35721147060394287, "learning_rate": 0.0002, "epoch": 1.0906612133606, "step": 800}, {"loss": 1.718, "grad_norm": 0.35366931557655334, "learning_rate": 0.0002, "epoch": 1.1042944785276074, "step": 810}, {"loss": 1.6797, "grad_norm": 0.3250770568847656, "learning_rate": 0.0002, "epoch": 1.117927743694615, "step": 820}, {"loss": 1.6383, "grad_norm": 0.3293766379356384, "learning_rate": 0.0002, "epoch": 1.1315610088616224, "step": 830}, {"loss": 1.7353, "grad_norm": 0.3380851745605469, "learning_rate": 0.0002, "epoch": 1.1451942740286298, "step": 840}, {"loss": 1.8236, "grad_norm": 0.32584455609321594, "learning_rate": 0.0002, "epoch": 1.1588275391956373, "step": 850}, {"loss": 1.6681, "grad_norm": 0.45700767636299133, "learning_rate": 0.0002, "epoch": 1.1724608043626448, "step": 860}, {"loss": 1.7494, "grad_norm": 0.30944544076919556, "learning_rate": 0.0002, "epoch": 1.1860940695296525, "step": 870}, {"loss": 1.7426, "grad_norm": 0.3268151581287384, "learning_rate": 0.0002, "epoch": 1.19972733469666, "step": 880}, {"loss": 1.7413, "grad_norm": 0.39972540736198425, "learning_rate": 0.0002, "epoch": 1.2133605998636674, "step": 890}, {"loss": 1.7481, "grad_norm": 0.7890929579734802, "learning_rate": 0.0002, "epoch": 1.2269938650306749, "step": 900}, {"loss": 1.7608, "grad_norm": 0.3439182639122009, "learning_rate": 0.0002, "epoch": 1.2406271301976823, "step": 910}, {"loss": 1.7617, "grad_norm": 0.3986225128173828, "learning_rate": 0.0002, "epoch": 1.2542603953646898, "step": 920}, {"loss": 1.6843, "grad_norm": 0.3514605164527893, "learning_rate": 0.0002, "epoch": 1.2678936605316973, "step": 930}, {"loss": 1.6987, "grad_norm": 0.3682589530944824, "learning_rate": 0.0002, "epoch": 1.2815269256987047, "step": 940}, {"loss": 1.6988, "grad_norm": 0.3618335723876953, "learning_rate": 0.0002, "epoch": 1.2951601908657122, "step": 950}, {"loss": 1.7436, "grad_norm": 0.345700740814209, "learning_rate": 0.0002, "epoch": 1.30879345603272, "step": 960}, {"loss": 1.7336, "grad_norm": 0.3514927923679352, "learning_rate": 0.0002, "epoch": 1.3224267211997274, "step": 970}, {"loss": 1.7704, "grad_norm": 0.365647554397583, "learning_rate": 0.0002, "epoch": 1.3360599863667348, "step": 980}, {"loss": 1.7104, "grad_norm": 0.3407285809516907, "learning_rate": 0.0002, "epoch": 1.3496932515337423, "step": 990}, {"loss": 1.7132, "grad_norm": 0.3785437345504761, "learning_rate": 0.0002, "epoch": 1.3633265167007498, "step": 1000}, {"loss": 1.766, "grad_norm": 0.34746724367141724, "learning_rate": 0.0002, "epoch": 1.3769597818677572, "step": 1010}, {"loss": 1.7252, "grad_norm": 0.362444132566452, "learning_rate": 0.0002, "epoch": 1.390593047034765, "step": 1020}, {"loss": 1.7132, "grad_norm": 0.4424704611301422, "learning_rate": 0.0002, "epoch": 1.4042263122017724, "step": 1030}, {"loss": 1.726, "grad_norm": 0.38722458481788635, "learning_rate": 0.0002, "epoch": 1.4178595773687799, "step": 1040}, {"loss": 1.7955, "grad_norm": 0.36089080572128296, "learning_rate": 0.0002, "epoch": 1.4314928425357873, "step": 1050}, {"loss": 1.6924, "grad_norm": 0.33817124366760254, "learning_rate": 0.0002, "epoch": 1.4451261077027948, "step": 1060}, {"loss": 1.7165, "grad_norm": 0.34334081411361694, "learning_rate": 0.0002, "epoch": 1.4587593728698023, "step": 1070}, {"loss": 1.6999, "grad_norm": 0.3776826858520508, "learning_rate": 0.0002, "epoch": 1.4723926380368098, "step": 1080}, {"loss": 1.7605, "grad_norm": 0.4169026017189026, "learning_rate": 0.0002, "epoch": 1.4860259032038172, "step": 1090}, {"loss": 1.7502, "grad_norm": 0.34898945689201355, "learning_rate": 0.0002, "epoch": 1.4996591683708247, "step": 1100}, {"loss": 1.635, "grad_norm": 0.34223780035972595, "learning_rate": 0.0002, "epoch": 1.5132924335378322, "step": 1110}, {"loss": 1.7248, "grad_norm": 0.3686901032924652, "learning_rate": 0.0002, "epoch": 1.5269256987048399, "step": 1120}, {"loss": 1.7525, "grad_norm": 0.35054415464401245, "learning_rate": 0.0002, "epoch": 1.5405589638718473, "step": 1130}, {"loss": 1.7776, "grad_norm": 0.39496365189552307, "learning_rate": 0.0002, "epoch": 1.5541922290388548, "step": 1140}, {"loss": 1.6574, "grad_norm": 0.35451626777648926, "learning_rate": 0.0002, "epoch": 1.5678254942058623, "step": 1150}, {"loss": 1.7257, "grad_norm": 0.3848083019256592, "learning_rate": 0.0002, "epoch": 1.58145875937287, "step": 1160}, {"loss": 1.7272, "grad_norm": 0.3760537803173065, "learning_rate": 0.0002, "epoch": 1.5950920245398774, "step": 1170}, {"loss": 1.7441, "grad_norm": 0.38981738686561584, "learning_rate": 0.0002, "epoch": 1.6087252897068849, "step": 1180}, {"loss": 1.6951, "grad_norm": 0.36830949783325195, "learning_rate": 0.0002, "epoch": 1.6223585548738924, "step": 1190}, {"loss": 1.6925, "grad_norm": 0.3405892848968506, "learning_rate": 0.0002, "epoch": 1.6359918200408998, "step": 1200}, {"loss": 1.7473, "grad_norm": 0.39027872681617737, "learning_rate": 0.0002, "epoch": 1.6496250852079073, "step": 1210}, {"loss": 1.6792, "grad_norm": 0.3342694044113159, "learning_rate": 0.0002, "epoch": 1.6632583503749148, "step": 1220}, {"loss": 1.7196, "grad_norm": 0.3600076735019684, "learning_rate": 0.0002, "epoch": 1.6768916155419222, "step": 1230}, {"loss": 1.7021, "grad_norm": 0.3625542223453522, "learning_rate": 0.0002, "epoch": 1.6905248807089297, "step": 1240}, {"loss": 1.6772, "grad_norm": 0.32170894742012024, "learning_rate": 0.0002, "epoch": 1.7041581458759372, "step": 1250}, {"loss": 1.7152, "grad_norm": 0.3544139862060547, "learning_rate": 0.0002, "epoch": 1.7177914110429446, "step": 1260}, {"loss": 1.7138, "grad_norm": 0.35113027691841125, "learning_rate": 0.0002, "epoch": 1.7314246762099523, "step": 1270}, {"loss": 1.7095, "grad_norm": 0.3499974310398102, "learning_rate": 0.0002, "epoch": 1.7450579413769598, "step": 1280}, {"loss": 1.7749, "grad_norm": 0.3285157382488251, "learning_rate": 0.0002, "epoch": 1.7586912065439673, "step": 1290}, {"loss": 1.6767, "grad_norm": 0.3701961636543274, "learning_rate": 0.0002, "epoch": 1.7723244717109747, "step": 1300}, {"loss": 1.6282, "grad_norm": 0.3301318287849426, "learning_rate": 0.0002, "epoch": 1.7859577368779824, "step": 1310}, {"loss": 1.7097, "grad_norm": 0.37801554799079895, "learning_rate": 0.0002, "epoch": 1.79959100204499, "step": 1320}, {"loss": 1.7437, "grad_norm": 0.3726748526096344, "learning_rate": 0.0002, "epoch": 1.8132242672119974, "step": 1330}, {"loss": 1.7959, "grad_norm": 0.4059790074825287, "learning_rate": 0.0002, "epoch": 1.8268575323790048, "step": 1340}, {"loss": 1.7739, "grad_norm": 0.35712096095085144, "learning_rate": 0.0002, "epoch": 1.8404907975460123, "step": 1350}, {"loss": 1.6375, "grad_norm": 0.35995328426361084, "learning_rate": 0.0002, "epoch": 1.8541240627130198, "step": 1360}, {"loss": 1.7332, "grad_norm": 0.3679947257041931, "learning_rate": 0.0002, "epoch": 1.8677573278800272, "step": 1370}, {"loss": 1.7587, "grad_norm": 0.39645957946777344, "learning_rate": 0.0002, "epoch": 1.8813905930470347, "step": 1380}, {"loss": 1.6985, "grad_norm": 0.35288700461387634, "learning_rate": 0.0002, "epoch": 1.8950238582140422, "step": 1390}, {"loss": 1.6582, "grad_norm": 0.32579198479652405, "learning_rate": 0.0002, "epoch": 1.9086571233810496, "step": 1400}, {"loss": 1.6948, "grad_norm": 0.3856561779975891, "learning_rate": 0.0002, "epoch": 1.9222903885480571, "step": 1410}, {"loss": 1.668, "grad_norm": 0.39019331336021423, "learning_rate": 0.0002, "epoch": 1.9359236537150648, "step": 1420}, {"loss": 1.7774, "grad_norm": 0.38006502389907837, "learning_rate": 0.0002, "epoch": 1.9495569188820723, "step": 1430}, {"loss": 1.8323, "grad_norm": 0.38100454211235046, "learning_rate": 0.0002, "epoch": 1.9631901840490797, "step": 1440}, {"loss": 1.7298, "grad_norm": 0.3405798673629761, "learning_rate": 0.0002, "epoch": 1.9768234492160872, "step": 1450}, {"loss": 1.7045, "grad_norm": 0.36582913994789124, "learning_rate": 0.0002, "epoch": 1.990456714383095, "step": 1460}, {"eval_loss": 1.8178424835205078, "eval_runtime": 53.6524, "eval_samples_per_second": 9.45, "eval_steps_per_second": 1.193, "epoch": 2.0, "step": 1467}, {"loss": 1.6363, "grad_norm": 0.3626647889614105, "learning_rate": 0.0002, "epoch": 2.0040899795501024, "step": 1470}, {"loss": 1.5354, "grad_norm": 0.40171775221824646, "learning_rate": 0.0002, "epoch": 2.01772324471711, "step": 1480}, {"loss": 1.5566, "grad_norm": 0.5805319547653198, "learning_rate": 0.0002, "epoch": 2.0313565098841173, "step": 1490}, {"loss": 1.546, "grad_norm": 0.41954153776168823, "learning_rate": 0.0002, "epoch": 2.044989775051125, "step": 1500}, {"loss": 1.6158, "grad_norm": 0.47190725803375244, "learning_rate": 0.0002, "epoch": 2.0586230402181322, "step": 1510}, {"loss": 1.5841, "grad_norm": 0.4388456344604492, "learning_rate": 0.0002, "epoch": 2.0722563053851397, "step": 1520}, {"loss": 1.5835, "grad_norm": 2.2171926498413086, "learning_rate": 0.0002, "epoch": 2.085889570552147, "step": 1530}, {"loss": 1.6137, "grad_norm": 0.4314221143722534, "learning_rate": 0.0002, "epoch": 2.0995228357191547, "step": 1540}, {"loss": 1.5511, "grad_norm": 0.4154265522956848, "learning_rate": 0.0002, "epoch": 2.113156100886162, "step": 1550}, {"loss": 1.6323, "grad_norm": 0.5025539994239807, "learning_rate": 0.0002, "epoch": 2.1267893660531696, "step": 1560}, {"loss": 1.5903, "grad_norm": 0.5410493016242981, "learning_rate": 0.0002, "epoch": 2.140422631220177, "step": 1570}, {"loss": 1.507, "grad_norm": 0.4478487968444824, "learning_rate": 0.0002, "epoch": 2.1540558963871845, "step": 1580}, {"loss": 1.5536, "grad_norm": 0.4703652560710907, "learning_rate": 0.0002, "epoch": 2.1676891615541924, "step": 1590}, {"loss": 1.5991, "grad_norm": 0.4555390179157257, "learning_rate": 0.0002, "epoch": 2.1813224267212, "step": 1600}, {"loss": 1.6117, "grad_norm": 0.4877263903617859, "learning_rate": 0.0002, "epoch": 2.1949556918882074, "step": 1610}, {"loss": 1.5928, "grad_norm": 0.48708245158195496, "learning_rate": 0.0002, "epoch": 2.208588957055215, "step": 1620}, {"loss": 1.6106, "grad_norm": 0.47523951530456543, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1630}, {"loss": 1.6013, "grad_norm": 0.4889199733734131, "learning_rate": 0.0002, "epoch": 2.23585548738923, "step": 1640}, {"loss": 1.6633, "grad_norm": 0.4585252106189728, "learning_rate": 0.0002, "epoch": 2.2494887525562373, "step": 1650}, {"loss": 1.6075, "grad_norm": 0.4764868915081024, "learning_rate": 0.0002, "epoch": 2.2631220177232447, "step": 1660}, {"loss": 1.6427, "grad_norm": 0.5028976202011108, "learning_rate": 0.0002, "epoch": 2.276755282890252, "step": 1670}, {"loss": 1.6258, "grad_norm": 0.46131211519241333, "learning_rate": 0.0002, "epoch": 2.2903885480572597, "step": 1680}, {"loss": 1.654, "grad_norm": 0.5422874689102173, "learning_rate": 0.0002, "epoch": 2.304021813224267, "step": 1690}, {"loss": 1.6331, "grad_norm": 0.47615355253219604, "learning_rate": 0.0002, "epoch": 2.3176550783912746, "step": 1700}, {"loss": 1.642, "grad_norm": 0.48005548119544983, "learning_rate": 0.0002, "epoch": 2.331288343558282, "step": 1710}, {"loss": 1.581, "grad_norm": 0.4387182295322418, "learning_rate": 0.0002, "epoch": 2.3449216087252895, "step": 1720}, {"loss": 1.5612, "grad_norm": 0.4487272799015045, "learning_rate": 0.0002, "epoch": 2.358554873892297, "step": 1730}, {"loss": 1.5089, "grad_norm": 0.5046455264091492, "learning_rate": 0.0002, "epoch": 2.372188139059305, "step": 1740}, {"loss": 1.5769, "grad_norm": 0.4653521180152893, "learning_rate": 0.0002, "epoch": 2.3858214042263124, "step": 1750}, {"loss": 1.6201, "grad_norm": 0.4737723469734192, "learning_rate": 0.0002, "epoch": 2.39945466939332, "step": 1760}, {"loss": 1.5933, "grad_norm": 0.4501931071281433, "learning_rate": 0.0002, "epoch": 2.4130879345603273, "step": 1770}, {"loss": 1.6321, "grad_norm": 0.4772880971431732, "learning_rate": 0.0002, "epoch": 2.426721199727335, "step": 1780}, {"loss": 1.5454, "grad_norm": 0.4544616937637329, "learning_rate": 0.0002, "epoch": 2.4403544648943423, "step": 1790}, {"loss": 1.5501, "grad_norm": 0.488313227891922, "learning_rate": 0.0002, "epoch": 2.4539877300613497, "step": 1800}, {"loss": 1.5791, "grad_norm": 0.5057830214500427, "learning_rate": 0.0002, "epoch": 2.467620995228357, "step": 1810}, {"loss": 1.5645, "grad_norm": 0.5049484968185425, "learning_rate": 0.0002, "epoch": 2.4812542603953647, "step": 1820}, {"loss": 1.6268, "grad_norm": 0.44966644048690796, "learning_rate": 0.0002, "epoch": 2.494887525562372, "step": 1830}, {"loss": 1.5941, "grad_norm": 0.5072630643844604, "learning_rate": 0.0002, "epoch": 2.5085207907293796, "step": 1840}, {"loss": 1.5251, "grad_norm": 0.43989792466163635, "learning_rate": 0.0002, "epoch": 2.522154055896387, "step": 1850}, {"loss": 1.563, "grad_norm": 1.3504403829574585, "learning_rate": 0.0002, "epoch": 2.5357873210633946, "step": 1860}, {"loss": 1.5681, "grad_norm": 0.46545976400375366, "learning_rate": 0.0002, "epoch": 2.549420586230402, "step": 1870}, {"loss": 1.6368, "grad_norm": 0.4678342044353485, "learning_rate": 0.0002, "epoch": 2.5630538513974095, "step": 1880}, {"loss": 1.5814, "grad_norm": 0.529755711555481, "learning_rate": 0.0002, "epoch": 2.5766871165644174, "step": 1890}, {"loss": 1.5861, "grad_norm": 0.5000199675559998, "learning_rate": 0.0002, "epoch": 2.5903203817314244, "step": 1900}, {"loss": 1.6346, "grad_norm": 0.5649300217628479, "learning_rate": 0.0002, "epoch": 2.6039536468984323, "step": 1910}, {"loss": 1.6317, "grad_norm": 0.7920585870742798, "learning_rate": 0.0002, "epoch": 2.61758691206544, "step": 1920}, {"loss": 1.643, "grad_norm": 0.4960342049598694, "learning_rate": 0.0002, "epoch": 2.6312201772324473, "step": 1930}, {"loss": 1.6099, "grad_norm": 0.5324710011482239, "learning_rate": 0.0002, "epoch": 2.6448534423994547, "step": 1940}, {"loss": 1.5874, "grad_norm": 0.606343150138855, "learning_rate": 0.0002, "epoch": 2.658486707566462, "step": 1950}, {"loss": 1.5728, "grad_norm": 0.53038489818573, "learning_rate": 0.0002, "epoch": 2.6721199727334697, "step": 1960}, {"loss": 1.5583, "grad_norm": 0.4579465091228485, "learning_rate": 0.0002, "epoch": 2.685753237900477, "step": 1970}, {"loss": 1.6093, "grad_norm": 0.4541707932949066, "learning_rate": 0.0002, "epoch": 2.6993865030674846, "step": 1980}, {"loss": 1.5316, "grad_norm": 0.5009395480155945, "learning_rate": 0.0002, "epoch": 2.713019768234492, "step": 1990}, {"loss": 1.6724, "grad_norm": 0.4723006784915924, "learning_rate": 0.0002, "epoch": 2.7266530334014996, "step": 2000}, {"loss": 1.638, "grad_norm": 0.5086126923561096, "learning_rate": 0.0002, "epoch": 2.740286298568507, "step": 2010}, {"loss": 1.6223, "grad_norm": 0.47242608666419983, "learning_rate": 0.0002, "epoch": 2.7539195637355145, "step": 2020}, {"loss": 1.6242, "grad_norm": 0.44922566413879395, "learning_rate": 0.0002, "epoch": 2.767552828902522, "step": 2030}, {"loss": 1.6837, "grad_norm": 0.420259565114975, "learning_rate": 0.0002, "epoch": 2.78118609406953, "step": 2040}, {"loss": 1.5612, "grad_norm": 0.4762881100177765, "learning_rate": 0.0002, "epoch": 2.794819359236537, "step": 2050}, {"loss": 1.5506, "grad_norm": 0.5228786468505859, "learning_rate": 0.0002, "epoch": 2.808452624403545, "step": 2060}, {"loss": 1.6347, "grad_norm": 0.4796035587787628, "learning_rate": 0.0002, "epoch": 2.8220858895705523, "step": 2070}, {"loss": 1.6843, "grad_norm": 0.5034735202789307, "learning_rate": 0.0002, "epoch": 2.8357191547375598, "step": 2080}, {"loss": 1.6455, "grad_norm": 0.48005399107933044, "learning_rate": 0.0002, "epoch": 2.8493524199045672, "step": 2090}, {"loss": 1.6287, "grad_norm": 0.578820526599884, "learning_rate": 0.0002, "epoch": 2.8629856850715747, "step": 2100}, {"loss": 1.6021, "grad_norm": 0.48982638120651245, "learning_rate": 0.0002, "epoch": 2.876618950238582, "step": 2110}, {"loss": 1.5769, "grad_norm": 0.5157325863838196, "learning_rate": 0.0002, "epoch": 2.8902522154055896, "step": 2120}, {"loss": 1.6089, "grad_norm": 0.49149683117866516, "learning_rate": 0.0002, "epoch": 2.903885480572597, "step": 2130}, {"loss": 1.5881, "grad_norm": 0.48584499955177307, "learning_rate": 0.0002, "epoch": 2.9175187457396046, "step": 2140}, {"loss": 1.5833, "grad_norm": 0.5199017524719238, "learning_rate": 0.0002, "epoch": 2.931152010906612, "step": 2150}, {"loss": 1.7344, "grad_norm": 0.5788236856460571, "learning_rate": 0.0002, "epoch": 2.9447852760736195, "step": 2160}, {"loss": 1.6103, "grad_norm": 0.48664185404777527, "learning_rate": 0.0002, "epoch": 2.958418541240627, "step": 2170}, {"loss": 1.5765, "grad_norm": 0.5026682615280151, "learning_rate": 0.0002, "epoch": 2.9720518064076344, "step": 2180}, {"loss": 1.6626, "grad_norm": 0.49317044019699097, "learning_rate": 0.0002, "epoch": 2.9856850715746424, "step": 2190}, {"loss": 1.5871, "grad_norm": 0.5729128122329712, "learning_rate": 0.0002, "epoch": 2.9993183367416494, "step": 2200}, {"eval_loss": 1.8527295589447021, "eval_runtime": 53.6403, "eval_samples_per_second": 9.452, "eval_steps_per_second": 1.193, "epoch": 2.9993183367416494, "step": 2200}, {"loss": 1.4719, "grad_norm": 0.5530241131782532, "learning_rate": 0.0002, "epoch": 3.0129516019086573, "step": 2210}, {"loss": 1.4088, "grad_norm": 0.6642216444015503, "learning_rate": 0.0002, "epoch": 3.0265848670756648, "step": 2220}, {"loss": 1.4382, "grad_norm": 0.61470627784729, "learning_rate": 0.0002, "epoch": 3.0402181322426722, "step": 2230}, {"loss": 1.4634, "grad_norm": 0.8559566140174866, "learning_rate": 0.0002, "epoch": 3.0538513974096797, "step": 2240}, {"loss": 1.3854, "grad_norm": 0.7015801668167114, "learning_rate": 0.0002, "epoch": 3.067484662576687, "step": 2250}, {"loss": 1.4981, "grad_norm": 0.7226442694664001, "learning_rate": 0.0002, "epoch": 3.0811179277436946, "step": 2260}, {"loss": 1.4143, "grad_norm": 0.7560588717460632, "learning_rate": 0.0002, "epoch": 3.094751192910702, "step": 2270}, {"loss": 1.4395, "grad_norm": 0.6216568946838379, "learning_rate": 0.0002, "epoch": 3.1083844580777096, "step": 2280}, {"loss": 1.3842, "grad_norm": 0.6768500804901123, "learning_rate": 0.0002, "epoch": 3.122017723244717, "step": 2290}, {"loss": 1.4672, "grad_norm": 0.7028762102127075, "learning_rate": 0.0002, "epoch": 3.1356509884117245, "step": 2300}, {"loss": 1.3826, "grad_norm": 0.6329697966575623, "learning_rate": 0.0002, "epoch": 3.149284253578732, "step": 2310}, {"loss": 1.442, "grad_norm": 0.6328264474868774, "learning_rate": 0.0002, "epoch": 3.1629175187457395, "step": 2320}, {"loss": 1.3762, "grad_norm": 0.7573632001876831, "learning_rate": 0.0002, "epoch": 3.176550783912747, "step": 2330}, {"loss": 1.3553, "grad_norm": 0.595740795135498, "learning_rate": 0.0002, "epoch": 3.190184049079755, "step": 2340}, {"loss": 1.3953, "grad_norm": 0.7111806869506836, "learning_rate": 0.0002, "epoch": 3.2038173142467623, "step": 2350}, {"loss": 1.3797, "grad_norm": 0.6328730583190918, "learning_rate": 0.0002, "epoch": 3.2174505794137698, "step": 2360}, {"loss": 1.3855, "grad_norm": 0.5860254168510437, "learning_rate": 0.0002, "epoch": 3.2310838445807772, "step": 2370}, {"loss": 1.4267, "grad_norm": 0.7387157082557678, "learning_rate": 0.0002, "epoch": 3.2447171097477847, "step": 2380}, {"loss": 1.4837, "grad_norm": 0.6897673606872559, "learning_rate": 0.0002, "epoch": 3.258350374914792, "step": 2390}, {"loss": 1.4372, "grad_norm": 0.7157699465751648, "learning_rate": 0.0002, "epoch": 3.2719836400817996, "step": 2400}, {"loss": 1.4432, "grad_norm": 0.6422511339187622, "learning_rate": 0.0002, "epoch": 3.285616905248807, "step": 2410}, {"loss": 1.4828, "grad_norm": 1.0481886863708496, "learning_rate": 0.0002, "epoch": 3.2992501704158146, "step": 2420}, {"loss": 1.4473, "grad_norm": 0.7050786018371582, "learning_rate": 0.0002, "epoch": 3.312883435582822, "step": 2430}, {"loss": 1.3465, "grad_norm": 0.6090759038925171, "learning_rate": 0.0002, "epoch": 3.3265167007498295, "step": 2440}, {"loss": 1.4619, "grad_norm": 0.6626465320587158, "learning_rate": 0.0002, "epoch": 3.340149965916837, "step": 2450}, {"loss": 1.4512, "grad_norm": 0.6565486788749695, "learning_rate": 0.0002, "epoch": 3.3537832310838445, "step": 2460}, {"loss": 1.588, "grad_norm": 0.6449528932571411, "learning_rate": 0.0002, "epoch": 3.367416496250852, "step": 2470}, {"loss": 1.4773, "grad_norm": 0.7746227383613586, "learning_rate": 0.0002, "epoch": 3.3810497614178594, "step": 2480}, {"loss": 1.417, "grad_norm": 0.7074846029281616, "learning_rate": 0.0002, "epoch": 3.3946830265848673, "step": 2490}, {"loss": 1.4476, "grad_norm": 0.6547690033912659, "learning_rate": 0.0002, "epoch": 3.4083162917518743, "step": 2500}, {"loss": 1.4074, "grad_norm": 0.784721314907074, "learning_rate": 0.0002, "epoch": 3.4219495569188823, "step": 2510}, {"loss": 1.4326, "grad_norm": 0.7270277738571167, "learning_rate": 0.0002, "epoch": 3.4355828220858897, "step": 2520}, {"loss": 1.4354, "grad_norm": 0.67588871717453, "learning_rate": 0.0002, "epoch": 3.449216087252897, "step": 2530}, {"loss": 1.4074, "grad_norm": 0.6768023371696472, "learning_rate": 0.0002, "epoch": 3.4628493524199047, "step": 2540}, {"loss": 1.4863, "grad_norm": 0.7026481628417969, "learning_rate": 0.0002, "epoch": 3.476482617586912, "step": 2550}, {"loss": 1.468, "grad_norm": 0.646075963973999, "learning_rate": 0.0002, "epoch": 3.4901158827539196, "step": 2560}, {"loss": 1.4058, "grad_norm": 0.6288973689079285, "learning_rate": 0.0002, "epoch": 3.503749147920927, "step": 2570}, {"loss": 1.4613, "grad_norm": 0.6440825462341309, "learning_rate": 0.0002, "epoch": 3.5173824130879345, "step": 2580}, {"loss": 1.3808, "grad_norm": 0.7074111700057983, "learning_rate": 0.0002, "epoch": 3.531015678254942, "step": 2590}, {"loss": 1.4901, "grad_norm": 0.7007562518119812, "learning_rate": 0.0002, "epoch": 3.5446489434219495, "step": 2600}, {"loss": 1.4511, "grad_norm": 0.6045376658439636, "learning_rate": 0.0002, "epoch": 3.558282208588957, "step": 2610}, {"loss": 1.4596, "grad_norm": 0.9149952530860901, "learning_rate": 0.0002, "epoch": 3.5719154737559644, "step": 2620}, {"loss": 1.4355, "grad_norm": 0.6490362882614136, "learning_rate": 0.0002, "epoch": 3.585548738922972, "step": 2630}, {"loss": 1.4107, "grad_norm": 0.6552226543426514, "learning_rate": 0.0002, "epoch": 3.59918200408998, "step": 2640}, {"loss": 1.433, "grad_norm": 0.6541850566864014, "learning_rate": 0.0002, "epoch": 3.612815269256987, "step": 2650}, {"loss": 1.4279, "grad_norm": 0.6500770449638367, "learning_rate": 0.0002, "epoch": 3.6264485344239947, "step": 2660}, {"loss": 1.3929, "grad_norm": 0.6345893740653992, "learning_rate": 0.0002, "epoch": 3.640081799591002, "step": 2670}, {"loss": 1.3634, "grad_norm": 0.6382275223731995, "learning_rate": 0.0002, "epoch": 3.6537150647580097, "step": 2680}, {"loss": 1.4478, "grad_norm": 0.6738566160202026, "learning_rate": 0.0002, "epoch": 3.667348329925017, "step": 2690}, {"loss": 1.4642, "grad_norm": 0.7446315288543701, "learning_rate": 0.0002, "epoch": 3.6809815950920246, "step": 2700}, {"loss": 1.4342, "grad_norm": 0.6717571020126343, "learning_rate": 0.0002, "epoch": 3.694614860259032, "step": 2710}, {"loss": 1.4285, "grad_norm": 0.667259693145752, "learning_rate": 0.0002, "epoch": 3.7082481254260395, "step": 2720}, {"loss": 1.5389, "grad_norm": 0.6808622479438782, "learning_rate": 0.0002, "epoch": 3.721881390593047, "step": 2730}, {"loss": 1.4297, "grad_norm": 0.7254287004470825, "learning_rate": 0.0002, "epoch": 3.7355146557600545, "step": 2740}, {"loss": 1.4176, "grad_norm": 0.6864007711410522, "learning_rate": 0.0002, "epoch": 3.749147920927062, "step": 2750}, {"loss": 1.4811, "grad_norm": 0.7041361331939697, "learning_rate": 0.0002, "epoch": 3.7627811860940694, "step": 2760}, {"loss": 1.4284, "grad_norm": 0.6559903025627136, "learning_rate": 0.0002, "epoch": 3.776414451261077, "step": 2770}, {"loss": 1.4608, "grad_norm": 0.6602269411087036, "learning_rate": 0.0002, "epoch": 3.7900477164280844, "step": 2780}, {"loss": 1.4588, "grad_norm": 0.692611813545227, "learning_rate": 0.0002, "epoch": 3.8036809815950923, "step": 2790}, {"loss": 1.4065, "grad_norm": 0.7051475644111633, "learning_rate": 0.0002, "epoch": 3.8173142467620993, "step": 2800}, {"loss": 1.4083, "grad_norm": 0.6685371398925781, "learning_rate": 0.0002, "epoch": 3.830947511929107, "step": 2810}, {"loss": 1.5227, "grad_norm": 0.6706477403640747, "learning_rate": 0.0002, "epoch": 3.8445807770961147, "step": 2820}, {"loss": 1.4076, "grad_norm": 0.6671637296676636, "learning_rate": 0.0002, "epoch": 3.858214042263122, "step": 2830}, {"loss": 1.4736, "grad_norm": 0.694092333316803, "learning_rate": 0.0002, "epoch": 3.8718473074301296, "step": 2840}, {"loss": 1.4161, "grad_norm": 0.7349600195884705, "learning_rate": 0.0002, "epoch": 3.885480572597137, "step": 2850}, {"loss": 1.4617, "grad_norm": 0.6647971868515015, "learning_rate": 0.0002, "epoch": 3.8991138377641446, "step": 2860}, {"loss": 1.5046, "grad_norm": 0.806656539440155, "learning_rate": 0.0002, "epoch": 3.912747102931152, "step": 2870}, {"loss": 1.428, "grad_norm": 0.6008772850036621, "learning_rate": 0.0002, "epoch": 3.9263803680981595, "step": 2880}, {"loss": 1.4116, "grad_norm": 0.659227728843689, "learning_rate": 0.0002, "epoch": 3.940013633265167, "step": 2890}, {"loss": 1.4136, "grad_norm": 0.6357656717300415, "learning_rate": 0.0002, "epoch": 3.9536468984321744, "step": 2900}, {"loss": 1.4655, "grad_norm": 0.6541687846183777, "learning_rate": 0.0002, "epoch": 3.967280163599182, "step": 2910}, {"loss": 1.4854, "grad_norm": 0.6090909838676453, "learning_rate": 0.0002, "epoch": 3.9809134287661894, "step": 2920}, {"loss": 1.4615, "grad_norm": 0.7198411822319031, "learning_rate": 0.0002, "epoch": 3.994546693933197, "step": 2930}, {"eval_loss": 1.9278366565704346, "eval_runtime": 53.6567, "eval_samples_per_second": 9.449, "eval_steps_per_second": 1.193, "epoch": 4.0, "step": 2934}, {"loss": 1.3159, "grad_norm": 0.6498575210571289, "learning_rate": 0.0002, "epoch": 4.008179959100205, "step": 2940}, {"loss": 1.2075, "grad_norm": 0.865602433681488, "learning_rate": 0.0002, "epoch": 4.021813224267212, "step": 2950}, {"loss": 1.1744, "grad_norm": 0.8514999151229858, "learning_rate": 0.0002, "epoch": 4.03544648943422, "step": 2960}, {"loss": 1.1553, "grad_norm": 1.0677322149276733, "learning_rate": 0.0002, "epoch": 4.049079754601227, "step": 2970}, {"loss": 1.1962, "grad_norm": 1.0126488208770752, "learning_rate": 0.0002, "epoch": 4.062713019768235, "step": 2980}, {"loss": 1.1631, "grad_norm": 1.0008870363235474, "learning_rate": 0.0002, "epoch": 4.076346284935242, "step": 2990}, {"loss": 1.2154, "grad_norm": 0.7942054271697998, "learning_rate": 0.0002, "epoch": 4.08997955010225, "step": 3000}, {"loss": 1.214, "grad_norm": 1.0482100248336792, "learning_rate": 0.0002, "epoch": 4.103612815269257, "step": 3010}, {"loss": 1.1999, "grad_norm": 1.0516992807388306, "learning_rate": 0.0002, "epoch": 4.1172460804362645, "step": 3020}, {"loss": 1.2108, "grad_norm": 0.8144322037696838, "learning_rate": 0.0002, "epoch": 4.130879345603272, "step": 3030}, {"loss": 1.1782, "grad_norm": 0.952297568321228, "learning_rate": 0.0002, "epoch": 4.144512610770279, "step": 3040}, {"loss": 1.2814, "grad_norm": 1.007645606994629, "learning_rate": 0.0002, "epoch": 4.158145875937287, "step": 3050}, {"loss": 1.1731, "grad_norm": 1.0480353832244873, "learning_rate": 0.0002, "epoch": 4.171779141104294, "step": 3060}, {"loss": 1.196, "grad_norm": 0.9270663857460022, "learning_rate": 0.0002, "epoch": 4.185412406271302, "step": 3070}, {"loss": 1.2167, "grad_norm": 1.3415262699127197, "learning_rate": 0.0002, "epoch": 4.199045671438309, "step": 3080}, {"loss": 1.2601, "grad_norm": 1.167606234550476, "learning_rate": 0.0002, "epoch": 4.212678936605317, "step": 3090}, {"loss": 1.2605, "grad_norm": 0.9418690800666809, "learning_rate": 0.0002, "epoch": 4.226312201772324, "step": 3100}, {"loss": 1.2184, "grad_norm": 1.0885876417160034, "learning_rate": 0.0002, "epoch": 4.239945466939332, "step": 3110}, {"loss": 1.2594, "grad_norm": 0.9165483713150024, "learning_rate": 0.0002, "epoch": 4.253578732106339, "step": 3120}, {"loss": 1.2933, "grad_norm": 0.9154694080352783, "learning_rate": 0.0002, "epoch": 4.267211997273347, "step": 3130}, {"loss": 1.2584, "grad_norm": 1.100580096244812, "learning_rate": 0.0002, "epoch": 4.280845262440354, "step": 3140}, {"loss": 1.251, "grad_norm": 0.9367576241493225, "learning_rate": 0.0002, "epoch": 4.294478527607362, "step": 3150}, {"loss": 1.2032, "grad_norm": 0.9744015336036682, "learning_rate": 0.0002, "epoch": 4.308111792774369, "step": 3160}, {"loss": 1.2787, "grad_norm": 0.9865175485610962, "learning_rate": 0.0002, "epoch": 4.321745057941377, "step": 3170}, {"loss": 1.2161, "grad_norm": 1.0124907493591309, "learning_rate": 0.0002, "epoch": 4.335378323108385, "step": 3180}, {"loss": 1.2452, "grad_norm": 1.1044819355010986, "learning_rate": 0.0002, "epoch": 4.349011588275392, "step": 3190}, {"loss": 1.2483, "grad_norm": 0.9305577278137207, "learning_rate": 0.0002, "epoch": 4.3626448534424, "step": 3200}, {"loss": 1.2101, "grad_norm": 0.969265341758728, "learning_rate": 0.0002, "epoch": 4.376278118609407, "step": 3210}, {"loss": 1.2355, "grad_norm": 1.0671923160552979, "learning_rate": 0.0002, "epoch": 4.389911383776415, "step": 3220}, {"loss": 1.2259, "grad_norm": 0.9440539479255676, "learning_rate": 0.0002, "epoch": 4.403544648943422, "step": 3230}, {"loss": 1.1706, "grad_norm": 0.9824562668800354, "learning_rate": 0.0002, "epoch": 4.41717791411043, "step": 3240}, {"loss": 1.2234, "grad_norm": 1.0245535373687744, "learning_rate": 0.0002, "epoch": 4.430811179277437, "step": 3250}, {"loss": 1.2713, "grad_norm": 0.9629312753677368, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 3260}, {"loss": 1.2689, "grad_norm": 1.1556470394134521, "learning_rate": 0.0002, "epoch": 4.458077709611452, "step": 3270}, {"loss": 1.2214, "grad_norm": 0.9796679019927979, "learning_rate": 0.0002, "epoch": 4.47171097477846, "step": 3280}, {"loss": 1.2823, "grad_norm": 0.9030535221099854, "learning_rate": 0.0002, "epoch": 4.485344239945467, "step": 3290}, {"loss": 1.2111, "grad_norm": 0.9142820835113525, "learning_rate": 0.0002, "epoch": 4.4989775051124745, "step": 3300}, {"loss": 1.2398, "grad_norm": 0.966867208480835, "learning_rate": 0.0002, "epoch": 4.5126107702794815, "step": 3310}, {"loss": 1.2537, "grad_norm": 1.0127079486846924, "learning_rate": 0.0002, "epoch": 4.5262440354464895, "step": 3320}, {"loss": 1.2059, "grad_norm": 1.055506706237793, "learning_rate": 0.0002, "epoch": 4.539877300613497, "step": 3330}, {"loss": 1.2958, "grad_norm": 0.9831468462944031, "learning_rate": 0.0002, "epoch": 4.553510565780504, "step": 3340}, {"loss": 1.2643, "grad_norm": 0.9304661154747009, "learning_rate": 0.0002, "epoch": 4.567143830947512, "step": 3350}, {"loss": 1.3621, "grad_norm": 0.9369107484817505, "learning_rate": 0.0002, "epoch": 4.580777096114519, "step": 3360}, {"loss": 1.2301, "grad_norm": 1.009506344795227, "learning_rate": 0.0002, "epoch": 4.594410361281527, "step": 3370}, {"loss": 1.2535, "grad_norm": 1.0575741529464722, "learning_rate": 0.0002, "epoch": 4.608043626448534, "step": 3380}, {"loss": 1.1914, "grad_norm": 0.9102860689163208, "learning_rate": 0.0002, "epoch": 4.621676891615542, "step": 3390}, {"loss": 1.3156, "grad_norm": 0.8111315965652466, "learning_rate": 0.0002, "epoch": 4.635310156782549, "step": 3400}, {"loss": 1.3103, "grad_norm": 0.9459649920463562, "learning_rate": 0.0002, "epoch": 4.648943421949557, "step": 3410}, {"loss": 1.3146, "grad_norm": 0.9709545969963074, "learning_rate": 0.0002, "epoch": 4.662576687116564, "step": 3420}, {"loss": 1.2958, "grad_norm": 0.9909247159957886, "learning_rate": 0.0002, "epoch": 4.676209952283572, "step": 3430}, {"loss": 1.3186, "grad_norm": 0.9094610810279846, "learning_rate": 0.0002, "epoch": 4.689843217450579, "step": 3440}, {"loss": 1.3397, "grad_norm": 0.9012220501899719, "learning_rate": 0.0002, "epoch": 4.703476482617587, "step": 3450}, {"loss": 1.2595, "grad_norm": 0.8669242858886719, "learning_rate": 0.0002, "epoch": 4.717109747784594, "step": 3460}, {"loss": 1.2762, "grad_norm": 0.9753699898719788, "learning_rate": 0.0002, "epoch": 4.730743012951602, "step": 3470}, {"loss": 1.2371, "grad_norm": 1.0252684354782104, "learning_rate": 0.0002, "epoch": 4.74437627811861, "step": 3480}, {"loss": 1.2536, "grad_norm": 1.208098292350769, "learning_rate": 0.0002, "epoch": 4.758009543285617, "step": 3490}, {"loss": 1.2256, "grad_norm": 0.8632914423942566, "learning_rate": 0.0002, "epoch": 4.771642808452625, "step": 3500}, {"loss": 1.3062, "grad_norm": 1.0084818601608276, "learning_rate": 0.0002, "epoch": 4.785276073619632, "step": 3510}, {"loss": 1.3004, "grad_norm": 0.9095172882080078, "learning_rate": 0.0002, "epoch": 4.79890933878664, "step": 3520}, {"loss": 1.263, "grad_norm": 0.9740135669708252, "learning_rate": 0.0002, "epoch": 4.812542603953647, "step": 3530}, {"loss": 1.2816, "grad_norm": 0.8862348794937134, "learning_rate": 0.0002, "epoch": 4.826175869120655, "step": 3540}, {"loss": 1.2275, "grad_norm": 1.0761774778366089, "learning_rate": 0.0002, "epoch": 4.839809134287662, "step": 3550}, {"loss": 1.2257, "grad_norm": 1.0134117603302002, "learning_rate": 0.0002, "epoch": 4.85344239945467, "step": 3560}, {"loss": 1.2904, "grad_norm": 0.9262851476669312, "learning_rate": 0.0002, "epoch": 4.867075664621677, "step": 3570}, {"loss": 1.1466, "grad_norm": 0.9518504738807678, "learning_rate": 0.0002, "epoch": 4.8807089297886845, "step": 3580}, {"loss": 1.2741, "grad_norm": 1.10103178024292, "learning_rate": 0.0002, "epoch": 4.894342194955692, "step": 3590}, {"loss": 1.2592, "grad_norm": 1.0133225917816162, "learning_rate": 0.0002, "epoch": 4.9079754601226995, "step": 3600}, {"loss": 1.2856, "grad_norm": 0.9637737274169922, "learning_rate": 0.0002, "epoch": 4.9216087252897065, "step": 3610}, {"loss": 1.2991, "grad_norm": 0.9800633192062378, "learning_rate": 0.0002, "epoch": 4.935241990456714, "step": 3620}, {"loss": 1.2872, "grad_norm": 1.0065973997116089, "learning_rate": 0.0002, "epoch": 4.948875255623722, "step": 3630}, {"loss": 1.2408, "grad_norm": 0.9354690313339233, "learning_rate": 0.0002, "epoch": 4.962508520790729, "step": 3640}, {"loss": 1.291, "grad_norm": 0.9744119048118591, "learning_rate": 0.0002, "epoch": 4.976141785957737, "step": 3650}, {"loss": 1.2513, "grad_norm": 0.9357708096504211, "learning_rate": 0.0002, "epoch": 4.989775051124744, "step": 3660}, {"eval_loss": 2.0763096809387207, "eval_runtime": 53.6578, "eval_samples_per_second": 9.449, "eval_steps_per_second": 1.193, "epoch": 4.999318336741649, "step": 3667}, {"loss": 1.2323, "grad_norm": 1.3171669244766235, "learning_rate": 0.0002, "epoch": 5.003408316291752, "step": 3670}, {"loss": 0.9509, "grad_norm": 1.4427374601364136, "learning_rate": 0.0002, "epoch": 5.017041581458759, "step": 3680}, {"loss": 1.011, "grad_norm": 0.9313797354698181, "learning_rate": 0.0002, "epoch": 5.030674846625767, "step": 3690}, {"loss": 0.9481, "grad_norm": 1.417641282081604, "learning_rate": 0.0002, "epoch": 5.044308111792774, "step": 3700}, {"loss": 0.9477, "grad_norm": 1.097440242767334, "learning_rate": 0.0002, "epoch": 5.057941376959782, "step": 3710}, {"loss": 1.0416, "grad_norm": 1.4277986288070679, "learning_rate": 0.0002, "epoch": 5.071574642126789, "step": 3720}, {"loss": 0.9718, "grad_norm": 1.2520873546600342, "learning_rate": 0.0002, "epoch": 5.085207907293797, "step": 3730}, {"loss": 0.9531, "grad_norm": 1.39503812789917, "learning_rate": 0.0002, "epoch": 5.098841172460804, "step": 3740}, {"loss": 0.9658, "grad_norm": 1.2345329523086548, "learning_rate": 0.0002, "epoch": 5.112474437627812, "step": 3750}, {"loss": 1.0615, "grad_norm": 1.2700239419937134, "learning_rate": 0.0002, "epoch": 5.126107702794819, "step": 3760}, {"loss": 0.993, "grad_norm": 1.5343066453933716, "learning_rate": 0.0002, "epoch": 5.139740967961827, "step": 3770}, {"loss": 0.9378, "grad_norm": 1.4191608428955078, "learning_rate": 0.0002, "epoch": 5.153374233128835, "step": 3780}, {"loss": 1.0179, "grad_norm": 1.4591023921966553, "learning_rate": 0.0002, "epoch": 5.167007498295842, "step": 3790}, {"loss": 1.0143, "grad_norm": 1.6158121824264526, "learning_rate": 0.0002, "epoch": 5.18064076346285, "step": 3800}, {"loss": 1.0056, "grad_norm": 1.6077582836151123, "learning_rate": 0.0002, "epoch": 5.194274028629857, "step": 3810}, {"loss": 0.9711, "grad_norm": 1.2815653085708618, "learning_rate": 0.0002, "epoch": 5.207907293796865, "step": 3820}, {"loss": 1.0131, "grad_norm": 1.2427219152450562, "learning_rate": 0.0002, "epoch": 5.221540558963872, "step": 3830}, {"loss": 0.9901, "grad_norm": 1.3013232946395874, "learning_rate": 0.0002, "epoch": 5.23517382413088, "step": 3840}, {"loss": 0.9862, "grad_norm": 1.4643588066101074, "learning_rate": 0.0002, "epoch": 5.248807089297887, "step": 3850}, {"loss": 1.0149, "grad_norm": 1.2571916580200195, "learning_rate": 0.0002, "epoch": 5.2624403544648946, "step": 3860}, {"loss": 0.9686, "grad_norm": 1.226682186126709, "learning_rate": 0.0002, "epoch": 5.276073619631902, "step": 3870}, {"loss": 0.9417, "grad_norm": 1.2541271448135376, "learning_rate": 0.0002, "epoch": 5.2897068847989095, "step": 3880}, {"loss": 0.9767, "grad_norm": 1.2340261936187744, "learning_rate": 0.0002, "epoch": 5.3033401499659165, "step": 3890}, {"loss": 1.0173, "grad_norm": 1.345527172088623, "learning_rate": 0.0002, "epoch": 5.316973415132924, "step": 3900}, {"loss": 1.0638, "grad_norm": 1.2128909826278687, "learning_rate": 0.0002, "epoch": 5.3306066802999315, "step": 3910}, {"loss": 1.0002, "grad_norm": 1.3052637577056885, "learning_rate": 0.0002, "epoch": 5.344239945466939, "step": 3920}, {"loss": 0.9754, "grad_norm": 1.1017392873764038, "learning_rate": 0.0002, "epoch": 5.357873210633947, "step": 3930}, {"loss": 1.0579, "grad_norm": 1.26950204372406, "learning_rate": 0.0002, "epoch": 5.371506475800954, "step": 3940}, {"loss": 1.0816, "grad_norm": 1.3372546434402466, "learning_rate": 0.0002, "epoch": 5.385139740967962, "step": 3950}, {"loss": 1.0529, "grad_norm": 1.3115156888961792, "learning_rate": 0.0002, "epoch": 5.398773006134969, "step": 3960}, {"loss": 1.1179, "grad_norm": 1.3511474132537842, "learning_rate": 0.0002, "epoch": 5.412406271301977, "step": 3970}, {"loss": 1.0352, "grad_norm": 1.1001893281936646, "learning_rate": 0.0002, "epoch": 5.426039536468984, "step": 3980}, {"loss": 1.0855, "grad_norm": 1.2810745239257812, "learning_rate": 0.0002, "epoch": 5.439672801635992, "step": 3990}, {"loss": 1.0573, "grad_norm": 1.2999306917190552, "learning_rate": 0.0002, "epoch": 5.453306066802999, "step": 4000}, {"loss": 1.0073, "grad_norm": 1.172553300857544, "learning_rate": 0.0002, "epoch": 5.466939331970007, "step": 4010}, {"loss": 1.003, "grad_norm": 1.1483557224273682, "learning_rate": 0.0002, "epoch": 5.480572597137014, "step": 4020}, {"loss": 1.0704, "grad_norm": 1.4148036241531372, "learning_rate": 0.0002, "epoch": 5.494205862304022, "step": 4030}, {"loss": 1.0519, "grad_norm": 1.1611121892929077, "learning_rate": 0.0002, "epoch": 5.507839127471029, "step": 4040}, {"loss": 1.0775, "grad_norm": 1.3837119340896606, "learning_rate": 0.0002, "epoch": 5.521472392638037, "step": 4050}, {"loss": 1.0257, "grad_norm": 1.3025696277618408, "learning_rate": 0.0002, "epoch": 5.535105657805044, "step": 4060}, {"loss": 1.0628, "grad_norm": 1.348091959953308, "learning_rate": 0.0002, "epoch": 5.548738922972052, "step": 4070}, {"loss": 1.0447, "grad_norm": 1.3463449478149414, "learning_rate": 0.0002, "epoch": 5.56237218813906, "step": 4080}, {"loss": 1.039, "grad_norm": 1.3904176950454712, "learning_rate": 0.0002, "epoch": 5.576005453306067, "step": 4090}, {"loss": 1.0963, "grad_norm": 1.2737950086593628, "learning_rate": 0.0002, "epoch": 5.589638718473074, "step": 4100}, {"loss": 1.0441, "grad_norm": 1.3311827182769775, "learning_rate": 0.0002, "epoch": 5.603271983640082, "step": 4110}, {"loss": 1.0521, "grad_norm": 1.24485182762146, "learning_rate": 0.0002, "epoch": 5.61690524880709, "step": 4120}, {"loss": 1.1103, "grad_norm": 1.2724957466125488, "learning_rate": 0.0002, "epoch": 5.630538513974097, "step": 4130}, {"loss": 1.0588, "grad_norm": 1.3439847230911255, "learning_rate": 0.0002, "epoch": 5.644171779141105, "step": 4140}, {"loss": 1.0257, "grad_norm": 1.372359037399292, "learning_rate": 0.0002, "epoch": 5.657805044308112, "step": 4150}, {"loss": 1.0475, "grad_norm": 1.2322949171066284, "learning_rate": 0.0002, "epoch": 5.6714383094751195, "step": 4160}, {"loss": 1.0465, "grad_norm": 1.4859193563461304, "learning_rate": 0.0002, "epoch": 5.6850715746421265, "step": 4170}, {"loss": 1.1569, "grad_norm": 1.4318448305130005, "learning_rate": 0.0002, "epoch": 5.6987048398091344, "step": 4180}, {"loss": 1.017, "grad_norm": 1.1533565521240234, "learning_rate": 0.0002, "epoch": 5.7123381049761415, "step": 4190}, {"loss": 1.0948, "grad_norm": 1.3009696006774902, "learning_rate": 0.0002, "epoch": 5.725971370143149, "step": 4200}, {"loss": 1.1229, "grad_norm": 1.3972162008285522, "learning_rate": 0.0002, "epoch": 5.739604635310156, "step": 4210}, {"loss": 1.033, "grad_norm": 1.2142186164855957, "learning_rate": 0.0002, "epoch": 5.753237900477164, "step": 4220}, {"loss": 1.0588, "grad_norm": 1.401191234588623, "learning_rate": 0.0002, "epoch": 5.766871165644172, "step": 4230}, {"loss": 1.0722, "grad_norm": 1.4124404191970825, "learning_rate": 0.0002, "epoch": 5.780504430811179, "step": 4240}, {"loss": 1.0826, "grad_norm": 1.3488332033157349, "learning_rate": 0.0002, "epoch": 5.794137695978186, "step": 4250}, {"loss": 1.0599, "grad_norm": 1.3671752214431763, "learning_rate": 0.0002, "epoch": 5.807770961145194, "step": 4260}, {"loss": 1.1294, "grad_norm": 1.2608201503753662, "learning_rate": 0.0002, "epoch": 5.821404226312202, "step": 4270}, {"loss": 1.1216, "grad_norm": 1.1814045906066895, "learning_rate": 0.0002, "epoch": 5.835037491479209, "step": 4280}, {"loss": 1.0973, "grad_norm": 1.4139586687088013, "learning_rate": 0.0002, "epoch": 5.848670756646217, "step": 4290}, {"loss": 1.0656, "grad_norm": 1.34248948097229, "learning_rate": 0.0002, "epoch": 5.862304021813224, "step": 4300}, {"loss": 1.0791, "grad_norm": 1.1428139209747314, "learning_rate": 0.0002, "epoch": 5.875937286980232, "step": 4310}, {"loss": 1.0556, "grad_norm": 1.1941087245941162, "learning_rate": 0.0002, "epoch": 5.889570552147239, "step": 4320}, {"loss": 1.1089, "grad_norm": 1.2374001741409302, "learning_rate": 0.0002, "epoch": 5.903203817314247, "step": 4330}, {"loss": 1.0802, "grad_norm": 1.4314988851547241, "learning_rate": 0.0002, "epoch": 5.916837082481254, "step": 4340}, {"loss": 1.133, "grad_norm": 1.1286126375198364, "learning_rate": 0.0002, "epoch": 5.930470347648262, "step": 4350}, {"loss": 1.0807, "grad_norm": 1.25884211063385, "learning_rate": 0.0002, "epoch": 5.944103612815269, "step": 4360}, {"loss": 1.1189, "grad_norm": 1.223357915878296, "learning_rate": 0.0002, "epoch": 5.957736877982277, "step": 4370}, {"loss": 1.1335, "grad_norm": 1.2173810005187988, "learning_rate": 0.0002, "epoch": 5.971370143149285, "step": 4380}, {"loss": 1.1201, "grad_norm": 1.3152292966842651, "learning_rate": 0.0002, "epoch": 5.985003408316292, "step": 4390}, {"loss": 1.1456, "grad_norm": 1.5576739311218262, "learning_rate": 0.0002, "epoch": 5.998636673483299, "step": 4400}]} +{"epoch": 6.999318336741649, "step": 5134, "epoch_duration": 1094.620125055313, "total_accumulated_duration": 7667.40259718895, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0982, "grad_norm": 0.7714291214942932, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5206, "grad_norm": 0.5473978519439697, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3079, "grad_norm": 0.5452795624732971, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 2.0019, "grad_norm": 0.5098028779029846, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9333, "grad_norm": 0.48062971234321594, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9355, "grad_norm": 0.4505695104598999, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9312, "grad_norm": 0.41609591245651245, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8656, "grad_norm": 0.4323892593383789, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9294, "grad_norm": 0.4670293629169464, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7946, "grad_norm": 0.40623316168785095, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8565, "grad_norm": 0.3620383143424988, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9238, "grad_norm": 0.332218736410141, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.93, "grad_norm": 0.4004521667957306, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7549, "grad_norm": 0.3698360323905945, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8771, "grad_norm": 0.3847949504852295, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8316, "grad_norm": 0.36843451857566833, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.838, "grad_norm": 0.37301021814346313, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8909, "grad_norm": 0.3718886971473694, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8454, "grad_norm": 0.3088490962982178, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9254, "grad_norm": 0.3611852526664734, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7844, "grad_norm": 0.36093324422836304, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3250400722026825, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8729, "grad_norm": 0.3566756248474121, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9259, "grad_norm": 0.32872408628463745, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9033, "grad_norm": 0.3983881175518036, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8588, "grad_norm": 0.3571510910987854, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8539, "grad_norm": 0.3036131262779236, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8572, "grad_norm": 0.36512863636016846, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8022, "grad_norm": 0.3429736793041229, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8754, "grad_norm": 0.3055964708328247, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8384, "grad_norm": 0.33801034092903137, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7933, "grad_norm": 0.348783016204834, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3057514727115631, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8766, "grad_norm": 0.3849763572216034, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8073, "grad_norm": 0.30080053210258484, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8548, "grad_norm": 0.3595106303691864, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8232, "grad_norm": 0.31099820137023926, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7029, "grad_norm": 0.3157978355884552, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8265, "grad_norm": 0.27960965037345886, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7414, "grad_norm": 0.3102385103702545, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7461, "grad_norm": 0.32828861474990845, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8165, "grad_norm": 0.29560017585754395, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9455, "grad_norm": 0.33316895365715027, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8241, "grad_norm": 0.30420982837677, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7565, "grad_norm": 0.32619214057922363, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7945, "grad_norm": 0.3603750765323639, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7773, "grad_norm": 0.30834096670150757, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8058, "grad_norm": 0.28756365180015564, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.744, "grad_norm": 0.2878406345844269, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8581, "grad_norm": 0.31329697370529175, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7886, "grad_norm": 0.3405822515487671, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.778, "grad_norm": 0.305560827255249, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7592, "grad_norm": 0.2973416745662689, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8223, "grad_norm": 0.327303946018219, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8591, "grad_norm": 0.62595534324646, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3129784166812897, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8035, "grad_norm": 0.32496583461761475, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7787, "grad_norm": 0.3098868131637573, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7196, "grad_norm": 0.30726853013038635, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7898, "grad_norm": 0.2964220643043518, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8114, "grad_norm": 0.32352274656295776, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.811, "grad_norm": 0.2938912510871887, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7727, "grad_norm": 0.295559823513031, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9, "grad_norm": 0.34102028608322144, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8006, "grad_norm": 0.29676181077957153, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8099, "grad_norm": 0.3108902871608734, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7955, "grad_norm": 0.2690821588039398, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7881, "grad_norm": 0.32752540707588196, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7661, "grad_norm": 0.8029476404190063, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7733, "grad_norm": 0.30534422397613525, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7614, "grad_norm": 0.2899954319000244, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7845, "grad_norm": 0.28814372420310974, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8865, "grad_norm": 0.3061596751213074, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}, {"eval_loss": 1.8171186447143555, "eval_runtime": 53.6047, "eval_samples_per_second": 9.458, "eval_steps_per_second": 1.194, "epoch": 0.9993183367416496, "step": 733}, {"loss": 1.6202, "grad_norm": 0.3140897750854492, "learning_rate": 0.0002, "epoch": 1.008861622358555, "step": 740}, {"loss": 1.8409, "grad_norm": 0.3346109390258789, "learning_rate": 0.0002, "epoch": 1.0224948875255624, "step": 750}, {"loss": 1.6777, "grad_norm": 0.3582976758480072, "learning_rate": 0.0002, "epoch": 1.0361281526925699, "step": 760}, {"loss": 1.7306, "grad_norm": 0.30408260226249695, "learning_rate": 0.0002, "epoch": 1.0497614178595773, "step": 770}, {"loss": 1.6967, "grad_norm": 0.323585569858551, "learning_rate": 0.0002, "epoch": 1.0633946830265848, "step": 780}, {"loss": 1.768, "grad_norm": 0.3474137783050537, "learning_rate": 0.0002, "epoch": 1.0770279481935923, "step": 790}, {"loss": 1.6895, "grad_norm": 0.35721147060394287, "learning_rate": 0.0002, "epoch": 1.0906612133606, "step": 800}, {"loss": 1.718, "grad_norm": 0.35366931557655334, "learning_rate": 0.0002, "epoch": 1.1042944785276074, "step": 810}, {"loss": 1.6797, "grad_norm": 0.3250770568847656, "learning_rate": 0.0002, "epoch": 1.117927743694615, "step": 820}, {"loss": 1.6383, "grad_norm": 0.3293766379356384, "learning_rate": 0.0002, "epoch": 1.1315610088616224, "step": 830}, {"loss": 1.7353, "grad_norm": 0.3380851745605469, "learning_rate": 0.0002, "epoch": 1.1451942740286298, "step": 840}, {"loss": 1.8236, "grad_norm": 0.32584455609321594, "learning_rate": 0.0002, "epoch": 1.1588275391956373, "step": 850}, {"loss": 1.6681, "grad_norm": 0.45700767636299133, "learning_rate": 0.0002, "epoch": 1.1724608043626448, "step": 860}, {"loss": 1.7494, "grad_norm": 0.30944544076919556, "learning_rate": 0.0002, "epoch": 1.1860940695296525, "step": 870}, {"loss": 1.7426, "grad_norm": 0.3268151581287384, "learning_rate": 0.0002, "epoch": 1.19972733469666, "step": 880}, {"loss": 1.7413, "grad_norm": 0.39972540736198425, "learning_rate": 0.0002, "epoch": 1.2133605998636674, "step": 890}, {"loss": 1.7481, "grad_norm": 0.7890929579734802, "learning_rate": 0.0002, "epoch": 1.2269938650306749, "step": 900}, {"loss": 1.7608, "grad_norm": 0.3439182639122009, "learning_rate": 0.0002, "epoch": 1.2406271301976823, "step": 910}, {"loss": 1.7617, "grad_norm": 0.3986225128173828, "learning_rate": 0.0002, "epoch": 1.2542603953646898, "step": 920}, {"loss": 1.6843, "grad_norm": 0.3514605164527893, "learning_rate": 0.0002, "epoch": 1.2678936605316973, "step": 930}, {"loss": 1.6987, "grad_norm": 0.3682589530944824, "learning_rate": 0.0002, "epoch": 1.2815269256987047, "step": 940}, {"loss": 1.6988, "grad_norm": 0.3618335723876953, "learning_rate": 0.0002, "epoch": 1.2951601908657122, "step": 950}, {"loss": 1.7436, "grad_norm": 0.345700740814209, "learning_rate": 0.0002, "epoch": 1.30879345603272, "step": 960}, {"loss": 1.7336, "grad_norm": 0.3514927923679352, "learning_rate": 0.0002, "epoch": 1.3224267211997274, "step": 970}, {"loss": 1.7704, "grad_norm": 0.365647554397583, "learning_rate": 0.0002, "epoch": 1.3360599863667348, "step": 980}, {"loss": 1.7104, "grad_norm": 0.3407285809516907, "learning_rate": 0.0002, "epoch": 1.3496932515337423, "step": 990}, {"loss": 1.7132, "grad_norm": 0.3785437345504761, "learning_rate": 0.0002, "epoch": 1.3633265167007498, "step": 1000}, {"loss": 1.766, "grad_norm": 0.34746724367141724, "learning_rate": 0.0002, "epoch": 1.3769597818677572, "step": 1010}, {"loss": 1.7252, "grad_norm": 0.362444132566452, "learning_rate": 0.0002, "epoch": 1.390593047034765, "step": 1020}, {"loss": 1.7132, "grad_norm": 0.4424704611301422, "learning_rate": 0.0002, "epoch": 1.4042263122017724, "step": 1030}, {"loss": 1.726, "grad_norm": 0.38722458481788635, "learning_rate": 0.0002, "epoch": 1.4178595773687799, "step": 1040}, {"loss": 1.7955, "grad_norm": 0.36089080572128296, "learning_rate": 0.0002, "epoch": 1.4314928425357873, "step": 1050}, {"loss": 1.6924, "grad_norm": 0.33817124366760254, "learning_rate": 0.0002, "epoch": 1.4451261077027948, "step": 1060}, {"loss": 1.7165, "grad_norm": 0.34334081411361694, "learning_rate": 0.0002, "epoch": 1.4587593728698023, "step": 1070}, {"loss": 1.6999, "grad_norm": 0.3776826858520508, "learning_rate": 0.0002, "epoch": 1.4723926380368098, "step": 1080}, {"loss": 1.7605, "grad_norm": 0.4169026017189026, "learning_rate": 0.0002, "epoch": 1.4860259032038172, "step": 1090}, {"loss": 1.7502, "grad_norm": 0.34898945689201355, "learning_rate": 0.0002, "epoch": 1.4996591683708247, "step": 1100}, {"loss": 1.635, "grad_norm": 0.34223780035972595, "learning_rate": 0.0002, "epoch": 1.5132924335378322, "step": 1110}, {"loss": 1.7248, "grad_norm": 0.3686901032924652, "learning_rate": 0.0002, "epoch": 1.5269256987048399, "step": 1120}, {"loss": 1.7525, "grad_norm": 0.35054415464401245, "learning_rate": 0.0002, "epoch": 1.5405589638718473, "step": 1130}, {"loss": 1.7776, "grad_norm": 0.39496365189552307, "learning_rate": 0.0002, "epoch": 1.5541922290388548, "step": 1140}, {"loss": 1.6574, "grad_norm": 0.35451626777648926, "learning_rate": 0.0002, "epoch": 1.5678254942058623, "step": 1150}, {"loss": 1.7257, "grad_norm": 0.3848083019256592, "learning_rate": 0.0002, "epoch": 1.58145875937287, "step": 1160}, {"loss": 1.7272, "grad_norm": 0.3760537803173065, "learning_rate": 0.0002, "epoch": 1.5950920245398774, "step": 1170}, {"loss": 1.7441, "grad_norm": 0.38981738686561584, "learning_rate": 0.0002, "epoch": 1.6087252897068849, "step": 1180}, {"loss": 1.6951, "grad_norm": 0.36830949783325195, "learning_rate": 0.0002, "epoch": 1.6223585548738924, "step": 1190}, {"loss": 1.6925, "grad_norm": 0.3405892848968506, "learning_rate": 0.0002, "epoch": 1.6359918200408998, "step": 1200}, {"loss": 1.7473, "grad_norm": 0.39027872681617737, "learning_rate": 0.0002, "epoch": 1.6496250852079073, "step": 1210}, {"loss": 1.6792, "grad_norm": 0.3342694044113159, "learning_rate": 0.0002, "epoch": 1.6632583503749148, "step": 1220}, {"loss": 1.7196, "grad_norm": 0.3600076735019684, "learning_rate": 0.0002, "epoch": 1.6768916155419222, "step": 1230}, {"loss": 1.7021, "grad_norm": 0.3625542223453522, "learning_rate": 0.0002, "epoch": 1.6905248807089297, "step": 1240}, {"loss": 1.6772, "grad_norm": 0.32170894742012024, "learning_rate": 0.0002, "epoch": 1.7041581458759372, "step": 1250}, {"loss": 1.7152, "grad_norm": 0.3544139862060547, "learning_rate": 0.0002, "epoch": 1.7177914110429446, "step": 1260}, {"loss": 1.7138, "grad_norm": 0.35113027691841125, "learning_rate": 0.0002, "epoch": 1.7314246762099523, "step": 1270}, {"loss": 1.7095, "grad_norm": 0.3499974310398102, "learning_rate": 0.0002, "epoch": 1.7450579413769598, "step": 1280}, {"loss": 1.7749, "grad_norm": 0.3285157382488251, "learning_rate": 0.0002, "epoch": 1.7586912065439673, "step": 1290}, {"loss": 1.6767, "grad_norm": 0.3701961636543274, "learning_rate": 0.0002, "epoch": 1.7723244717109747, "step": 1300}, {"loss": 1.6282, "grad_norm": 0.3301318287849426, "learning_rate": 0.0002, "epoch": 1.7859577368779824, "step": 1310}, {"loss": 1.7097, "grad_norm": 0.37801554799079895, "learning_rate": 0.0002, "epoch": 1.79959100204499, "step": 1320}, {"loss": 1.7437, "grad_norm": 0.3726748526096344, "learning_rate": 0.0002, "epoch": 1.8132242672119974, "step": 1330}, {"loss": 1.7959, "grad_norm": 0.4059790074825287, "learning_rate": 0.0002, "epoch": 1.8268575323790048, "step": 1340}, {"loss": 1.7739, "grad_norm": 0.35712096095085144, "learning_rate": 0.0002, "epoch": 1.8404907975460123, "step": 1350}, {"loss": 1.6375, "grad_norm": 0.35995328426361084, "learning_rate": 0.0002, "epoch": 1.8541240627130198, "step": 1360}, {"loss": 1.7332, "grad_norm": 0.3679947257041931, "learning_rate": 0.0002, "epoch": 1.8677573278800272, "step": 1370}, {"loss": 1.7587, "grad_norm": 0.39645957946777344, "learning_rate": 0.0002, "epoch": 1.8813905930470347, "step": 1380}, {"loss": 1.6985, "grad_norm": 0.35288700461387634, "learning_rate": 0.0002, "epoch": 1.8950238582140422, "step": 1390}, {"loss": 1.6582, "grad_norm": 0.32579198479652405, "learning_rate": 0.0002, "epoch": 1.9086571233810496, "step": 1400}, {"loss": 1.6948, "grad_norm": 0.3856561779975891, "learning_rate": 0.0002, "epoch": 1.9222903885480571, "step": 1410}, {"loss": 1.668, "grad_norm": 0.39019331336021423, "learning_rate": 0.0002, "epoch": 1.9359236537150648, "step": 1420}, {"loss": 1.7774, "grad_norm": 0.38006502389907837, "learning_rate": 0.0002, "epoch": 1.9495569188820723, "step": 1430}, {"loss": 1.8323, "grad_norm": 0.38100454211235046, "learning_rate": 0.0002, "epoch": 1.9631901840490797, "step": 1440}, {"loss": 1.7298, "grad_norm": 0.3405798673629761, "learning_rate": 0.0002, "epoch": 1.9768234492160872, "step": 1450}, {"loss": 1.7045, "grad_norm": 0.36582913994789124, "learning_rate": 0.0002, "epoch": 1.990456714383095, "step": 1460}, {"eval_loss": 1.8178424835205078, "eval_runtime": 53.6524, "eval_samples_per_second": 9.45, "eval_steps_per_second": 1.193, "epoch": 2.0, "step": 1467}, {"loss": 1.6363, "grad_norm": 0.3626647889614105, "learning_rate": 0.0002, "epoch": 2.0040899795501024, "step": 1470}, {"loss": 1.5354, "grad_norm": 0.40171775221824646, "learning_rate": 0.0002, "epoch": 2.01772324471711, "step": 1480}, {"loss": 1.5566, "grad_norm": 0.5805319547653198, "learning_rate": 0.0002, "epoch": 2.0313565098841173, "step": 1490}, {"loss": 1.546, "grad_norm": 0.41954153776168823, "learning_rate": 0.0002, "epoch": 2.044989775051125, "step": 1500}, {"loss": 1.6158, "grad_norm": 0.47190725803375244, "learning_rate": 0.0002, "epoch": 2.0586230402181322, "step": 1510}, {"loss": 1.5841, "grad_norm": 0.4388456344604492, "learning_rate": 0.0002, "epoch": 2.0722563053851397, "step": 1520}, {"loss": 1.5835, "grad_norm": 2.2171926498413086, "learning_rate": 0.0002, "epoch": 2.085889570552147, "step": 1530}, {"loss": 1.6137, "grad_norm": 0.4314221143722534, "learning_rate": 0.0002, "epoch": 2.0995228357191547, "step": 1540}, {"loss": 1.5511, "grad_norm": 0.4154265522956848, "learning_rate": 0.0002, "epoch": 2.113156100886162, "step": 1550}, {"loss": 1.6323, "grad_norm": 0.5025539994239807, "learning_rate": 0.0002, "epoch": 2.1267893660531696, "step": 1560}, {"loss": 1.5903, "grad_norm": 0.5410493016242981, "learning_rate": 0.0002, "epoch": 2.140422631220177, "step": 1570}, {"loss": 1.507, "grad_norm": 0.4478487968444824, "learning_rate": 0.0002, "epoch": 2.1540558963871845, "step": 1580}, {"loss": 1.5536, "grad_norm": 0.4703652560710907, "learning_rate": 0.0002, "epoch": 2.1676891615541924, "step": 1590}, {"loss": 1.5991, "grad_norm": 0.4555390179157257, "learning_rate": 0.0002, "epoch": 2.1813224267212, "step": 1600}, {"loss": 1.6117, "grad_norm": 0.4877263903617859, "learning_rate": 0.0002, "epoch": 2.1949556918882074, "step": 1610}, {"loss": 1.5928, "grad_norm": 0.48708245158195496, "learning_rate": 0.0002, "epoch": 2.208588957055215, "step": 1620}, {"loss": 1.6106, "grad_norm": 0.47523951530456543, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1630}, {"loss": 1.6013, "grad_norm": 0.4889199733734131, "learning_rate": 0.0002, "epoch": 2.23585548738923, "step": 1640}, {"loss": 1.6633, "grad_norm": 0.4585252106189728, "learning_rate": 0.0002, "epoch": 2.2494887525562373, "step": 1650}, {"loss": 1.6075, "grad_norm": 0.4764868915081024, "learning_rate": 0.0002, "epoch": 2.2631220177232447, "step": 1660}, {"loss": 1.6427, "grad_norm": 0.5028976202011108, "learning_rate": 0.0002, "epoch": 2.276755282890252, "step": 1670}, {"loss": 1.6258, "grad_norm": 0.46131211519241333, "learning_rate": 0.0002, "epoch": 2.2903885480572597, "step": 1680}, {"loss": 1.654, "grad_norm": 0.5422874689102173, "learning_rate": 0.0002, "epoch": 2.304021813224267, "step": 1690}, {"loss": 1.6331, "grad_norm": 0.47615355253219604, "learning_rate": 0.0002, "epoch": 2.3176550783912746, "step": 1700}, {"loss": 1.642, "grad_norm": 0.48005548119544983, "learning_rate": 0.0002, "epoch": 2.331288343558282, "step": 1710}, {"loss": 1.581, "grad_norm": 0.4387182295322418, "learning_rate": 0.0002, "epoch": 2.3449216087252895, "step": 1720}, {"loss": 1.5612, "grad_norm": 0.4487272799015045, "learning_rate": 0.0002, "epoch": 2.358554873892297, "step": 1730}, {"loss": 1.5089, "grad_norm": 0.5046455264091492, "learning_rate": 0.0002, "epoch": 2.372188139059305, "step": 1740}, {"loss": 1.5769, "grad_norm": 0.4653521180152893, "learning_rate": 0.0002, "epoch": 2.3858214042263124, "step": 1750}, {"loss": 1.6201, "grad_norm": 0.4737723469734192, "learning_rate": 0.0002, "epoch": 2.39945466939332, "step": 1760}, {"loss": 1.5933, "grad_norm": 0.4501931071281433, "learning_rate": 0.0002, "epoch": 2.4130879345603273, "step": 1770}, {"loss": 1.6321, "grad_norm": 0.4772880971431732, "learning_rate": 0.0002, "epoch": 2.426721199727335, "step": 1780}, {"loss": 1.5454, "grad_norm": 0.4544616937637329, "learning_rate": 0.0002, "epoch": 2.4403544648943423, "step": 1790}, {"loss": 1.5501, "grad_norm": 0.488313227891922, "learning_rate": 0.0002, "epoch": 2.4539877300613497, "step": 1800}, {"loss": 1.5791, "grad_norm": 0.5057830214500427, "learning_rate": 0.0002, "epoch": 2.467620995228357, "step": 1810}, {"loss": 1.5645, "grad_norm": 0.5049484968185425, "learning_rate": 0.0002, "epoch": 2.4812542603953647, "step": 1820}, {"loss": 1.6268, "grad_norm": 0.44966644048690796, "learning_rate": 0.0002, "epoch": 2.494887525562372, "step": 1830}, {"loss": 1.5941, "grad_norm": 0.5072630643844604, "learning_rate": 0.0002, "epoch": 2.5085207907293796, "step": 1840}, {"loss": 1.5251, "grad_norm": 0.43989792466163635, "learning_rate": 0.0002, "epoch": 2.522154055896387, "step": 1850}, {"loss": 1.563, "grad_norm": 1.3504403829574585, "learning_rate": 0.0002, "epoch": 2.5357873210633946, "step": 1860}, {"loss": 1.5681, "grad_norm": 0.46545976400375366, "learning_rate": 0.0002, "epoch": 2.549420586230402, "step": 1870}, {"loss": 1.6368, "grad_norm": 0.4678342044353485, "learning_rate": 0.0002, "epoch": 2.5630538513974095, "step": 1880}, {"loss": 1.5814, "grad_norm": 0.529755711555481, "learning_rate": 0.0002, "epoch": 2.5766871165644174, "step": 1890}, {"loss": 1.5861, "grad_norm": 0.5000199675559998, "learning_rate": 0.0002, "epoch": 2.5903203817314244, "step": 1900}, {"loss": 1.6346, "grad_norm": 0.5649300217628479, "learning_rate": 0.0002, "epoch": 2.6039536468984323, "step": 1910}, {"loss": 1.6317, "grad_norm": 0.7920585870742798, "learning_rate": 0.0002, "epoch": 2.61758691206544, "step": 1920}, {"loss": 1.643, "grad_norm": 0.4960342049598694, "learning_rate": 0.0002, "epoch": 2.6312201772324473, "step": 1930}, {"loss": 1.6099, "grad_norm": 0.5324710011482239, "learning_rate": 0.0002, "epoch": 2.6448534423994547, "step": 1940}, {"loss": 1.5874, "grad_norm": 0.606343150138855, "learning_rate": 0.0002, "epoch": 2.658486707566462, "step": 1950}, {"loss": 1.5728, "grad_norm": 0.53038489818573, "learning_rate": 0.0002, "epoch": 2.6721199727334697, "step": 1960}, {"loss": 1.5583, "grad_norm": 0.4579465091228485, "learning_rate": 0.0002, "epoch": 2.685753237900477, "step": 1970}, {"loss": 1.6093, "grad_norm": 0.4541707932949066, "learning_rate": 0.0002, "epoch": 2.6993865030674846, "step": 1980}, {"loss": 1.5316, "grad_norm": 0.5009395480155945, "learning_rate": 0.0002, "epoch": 2.713019768234492, "step": 1990}, {"loss": 1.6724, "grad_norm": 0.4723006784915924, "learning_rate": 0.0002, "epoch": 2.7266530334014996, "step": 2000}, {"loss": 1.638, "grad_norm": 0.5086126923561096, "learning_rate": 0.0002, "epoch": 2.740286298568507, "step": 2010}, {"loss": 1.6223, "grad_norm": 0.47242608666419983, "learning_rate": 0.0002, "epoch": 2.7539195637355145, "step": 2020}, {"loss": 1.6242, "grad_norm": 0.44922566413879395, "learning_rate": 0.0002, "epoch": 2.767552828902522, "step": 2030}, {"loss": 1.6837, "grad_norm": 0.420259565114975, "learning_rate": 0.0002, "epoch": 2.78118609406953, "step": 2040}, {"loss": 1.5612, "grad_norm": 0.4762881100177765, "learning_rate": 0.0002, "epoch": 2.794819359236537, "step": 2050}, {"loss": 1.5506, "grad_norm": 0.5228786468505859, "learning_rate": 0.0002, "epoch": 2.808452624403545, "step": 2060}, {"loss": 1.6347, "grad_norm": 0.4796035587787628, "learning_rate": 0.0002, "epoch": 2.8220858895705523, "step": 2070}, {"loss": 1.6843, "grad_norm": 0.5034735202789307, "learning_rate": 0.0002, "epoch": 2.8357191547375598, "step": 2080}, {"loss": 1.6455, "grad_norm": 0.48005399107933044, "learning_rate": 0.0002, "epoch": 2.8493524199045672, "step": 2090}, {"loss": 1.6287, "grad_norm": 0.578820526599884, "learning_rate": 0.0002, "epoch": 2.8629856850715747, "step": 2100}, {"loss": 1.6021, "grad_norm": 0.48982638120651245, "learning_rate": 0.0002, "epoch": 2.876618950238582, "step": 2110}, {"loss": 1.5769, "grad_norm": 0.5157325863838196, "learning_rate": 0.0002, "epoch": 2.8902522154055896, "step": 2120}, {"loss": 1.6089, "grad_norm": 0.49149683117866516, "learning_rate": 0.0002, "epoch": 2.903885480572597, "step": 2130}, {"loss": 1.5881, "grad_norm": 0.48584499955177307, "learning_rate": 0.0002, "epoch": 2.9175187457396046, "step": 2140}, {"loss": 1.5833, "grad_norm": 0.5199017524719238, "learning_rate": 0.0002, "epoch": 2.931152010906612, "step": 2150}, {"loss": 1.7344, "grad_norm": 0.5788236856460571, "learning_rate": 0.0002, "epoch": 2.9447852760736195, "step": 2160}, {"loss": 1.6103, "grad_norm": 0.48664185404777527, "learning_rate": 0.0002, "epoch": 2.958418541240627, "step": 2170}, {"loss": 1.5765, "grad_norm": 0.5026682615280151, "learning_rate": 0.0002, "epoch": 2.9720518064076344, "step": 2180}, {"loss": 1.6626, "grad_norm": 0.49317044019699097, "learning_rate": 0.0002, "epoch": 2.9856850715746424, "step": 2190}, {"loss": 1.5871, "grad_norm": 0.5729128122329712, "learning_rate": 0.0002, "epoch": 2.9993183367416494, "step": 2200}, {"eval_loss": 1.8527295589447021, "eval_runtime": 53.6403, "eval_samples_per_second": 9.452, "eval_steps_per_second": 1.193, "epoch": 2.9993183367416494, "step": 2200}, {"loss": 1.4719, "grad_norm": 0.5530241131782532, "learning_rate": 0.0002, "epoch": 3.0129516019086573, "step": 2210}, {"loss": 1.4088, "grad_norm": 0.6642216444015503, "learning_rate": 0.0002, "epoch": 3.0265848670756648, "step": 2220}, {"loss": 1.4382, "grad_norm": 0.61470627784729, "learning_rate": 0.0002, "epoch": 3.0402181322426722, "step": 2230}, {"loss": 1.4634, "grad_norm": 0.8559566140174866, "learning_rate": 0.0002, "epoch": 3.0538513974096797, "step": 2240}, {"loss": 1.3854, "grad_norm": 0.7015801668167114, "learning_rate": 0.0002, "epoch": 3.067484662576687, "step": 2250}, {"loss": 1.4981, "grad_norm": 0.7226442694664001, "learning_rate": 0.0002, "epoch": 3.0811179277436946, "step": 2260}, {"loss": 1.4143, "grad_norm": 0.7560588717460632, "learning_rate": 0.0002, "epoch": 3.094751192910702, "step": 2270}, {"loss": 1.4395, "grad_norm": 0.6216568946838379, "learning_rate": 0.0002, "epoch": 3.1083844580777096, "step": 2280}, {"loss": 1.3842, "grad_norm": 0.6768500804901123, "learning_rate": 0.0002, "epoch": 3.122017723244717, "step": 2290}, {"loss": 1.4672, "grad_norm": 0.7028762102127075, "learning_rate": 0.0002, "epoch": 3.1356509884117245, "step": 2300}, {"loss": 1.3826, "grad_norm": 0.6329697966575623, "learning_rate": 0.0002, "epoch": 3.149284253578732, "step": 2310}, {"loss": 1.442, "grad_norm": 0.6328264474868774, "learning_rate": 0.0002, "epoch": 3.1629175187457395, "step": 2320}, {"loss": 1.3762, "grad_norm": 0.7573632001876831, "learning_rate": 0.0002, "epoch": 3.176550783912747, "step": 2330}, {"loss": 1.3553, "grad_norm": 0.595740795135498, "learning_rate": 0.0002, "epoch": 3.190184049079755, "step": 2340}, {"loss": 1.3953, "grad_norm": 0.7111806869506836, "learning_rate": 0.0002, "epoch": 3.2038173142467623, "step": 2350}, {"loss": 1.3797, "grad_norm": 0.6328730583190918, "learning_rate": 0.0002, "epoch": 3.2174505794137698, "step": 2360}, {"loss": 1.3855, "grad_norm": 0.5860254168510437, "learning_rate": 0.0002, "epoch": 3.2310838445807772, "step": 2370}, {"loss": 1.4267, "grad_norm": 0.7387157082557678, "learning_rate": 0.0002, "epoch": 3.2447171097477847, "step": 2380}, {"loss": 1.4837, "grad_norm": 0.6897673606872559, "learning_rate": 0.0002, "epoch": 3.258350374914792, "step": 2390}, {"loss": 1.4372, "grad_norm": 0.7157699465751648, "learning_rate": 0.0002, "epoch": 3.2719836400817996, "step": 2400}, {"loss": 1.4432, "grad_norm": 0.6422511339187622, "learning_rate": 0.0002, "epoch": 3.285616905248807, "step": 2410}, {"loss": 1.4828, "grad_norm": 1.0481886863708496, "learning_rate": 0.0002, "epoch": 3.2992501704158146, "step": 2420}, {"loss": 1.4473, "grad_norm": 0.7050786018371582, "learning_rate": 0.0002, "epoch": 3.312883435582822, "step": 2430}, {"loss": 1.3465, "grad_norm": 0.6090759038925171, "learning_rate": 0.0002, "epoch": 3.3265167007498295, "step": 2440}, {"loss": 1.4619, "grad_norm": 0.6626465320587158, "learning_rate": 0.0002, "epoch": 3.340149965916837, "step": 2450}, {"loss": 1.4512, "grad_norm": 0.6565486788749695, "learning_rate": 0.0002, "epoch": 3.3537832310838445, "step": 2460}, {"loss": 1.588, "grad_norm": 0.6449528932571411, "learning_rate": 0.0002, "epoch": 3.367416496250852, "step": 2470}, {"loss": 1.4773, "grad_norm": 0.7746227383613586, "learning_rate": 0.0002, "epoch": 3.3810497614178594, "step": 2480}, {"loss": 1.417, "grad_norm": 0.7074846029281616, "learning_rate": 0.0002, "epoch": 3.3946830265848673, "step": 2490}, {"loss": 1.4476, "grad_norm": 0.6547690033912659, "learning_rate": 0.0002, "epoch": 3.4083162917518743, "step": 2500}, {"loss": 1.4074, "grad_norm": 0.784721314907074, "learning_rate": 0.0002, "epoch": 3.4219495569188823, "step": 2510}, {"loss": 1.4326, "grad_norm": 0.7270277738571167, "learning_rate": 0.0002, "epoch": 3.4355828220858897, "step": 2520}, {"loss": 1.4354, "grad_norm": 0.67588871717453, "learning_rate": 0.0002, "epoch": 3.449216087252897, "step": 2530}, {"loss": 1.4074, "grad_norm": 0.6768023371696472, "learning_rate": 0.0002, "epoch": 3.4628493524199047, "step": 2540}, {"loss": 1.4863, "grad_norm": 0.7026481628417969, "learning_rate": 0.0002, "epoch": 3.476482617586912, "step": 2550}, {"loss": 1.468, "grad_norm": 0.646075963973999, "learning_rate": 0.0002, "epoch": 3.4901158827539196, "step": 2560}, {"loss": 1.4058, "grad_norm": 0.6288973689079285, "learning_rate": 0.0002, "epoch": 3.503749147920927, "step": 2570}, {"loss": 1.4613, "grad_norm": 0.6440825462341309, "learning_rate": 0.0002, "epoch": 3.5173824130879345, "step": 2580}, {"loss": 1.3808, "grad_norm": 0.7074111700057983, "learning_rate": 0.0002, "epoch": 3.531015678254942, "step": 2590}, {"loss": 1.4901, "grad_norm": 0.7007562518119812, "learning_rate": 0.0002, "epoch": 3.5446489434219495, "step": 2600}, {"loss": 1.4511, "grad_norm": 0.6045376658439636, "learning_rate": 0.0002, "epoch": 3.558282208588957, "step": 2610}, {"loss": 1.4596, "grad_norm": 0.9149952530860901, "learning_rate": 0.0002, "epoch": 3.5719154737559644, "step": 2620}, {"loss": 1.4355, "grad_norm": 0.6490362882614136, "learning_rate": 0.0002, "epoch": 3.585548738922972, "step": 2630}, {"loss": 1.4107, "grad_norm": 0.6552226543426514, "learning_rate": 0.0002, "epoch": 3.59918200408998, "step": 2640}, {"loss": 1.433, "grad_norm": 0.6541850566864014, "learning_rate": 0.0002, "epoch": 3.612815269256987, "step": 2650}, {"loss": 1.4279, "grad_norm": 0.6500770449638367, "learning_rate": 0.0002, "epoch": 3.6264485344239947, "step": 2660}, {"loss": 1.3929, "grad_norm": 0.6345893740653992, "learning_rate": 0.0002, "epoch": 3.640081799591002, "step": 2670}, {"loss": 1.3634, "grad_norm": 0.6382275223731995, "learning_rate": 0.0002, "epoch": 3.6537150647580097, "step": 2680}, {"loss": 1.4478, "grad_norm": 0.6738566160202026, "learning_rate": 0.0002, "epoch": 3.667348329925017, "step": 2690}, {"loss": 1.4642, "grad_norm": 0.7446315288543701, "learning_rate": 0.0002, "epoch": 3.6809815950920246, "step": 2700}, {"loss": 1.4342, "grad_norm": 0.6717571020126343, "learning_rate": 0.0002, "epoch": 3.694614860259032, "step": 2710}, {"loss": 1.4285, "grad_norm": 0.667259693145752, "learning_rate": 0.0002, "epoch": 3.7082481254260395, "step": 2720}, {"loss": 1.5389, "grad_norm": 0.6808622479438782, "learning_rate": 0.0002, "epoch": 3.721881390593047, "step": 2730}, {"loss": 1.4297, "grad_norm": 0.7254287004470825, "learning_rate": 0.0002, "epoch": 3.7355146557600545, "step": 2740}, {"loss": 1.4176, "grad_norm": 0.6864007711410522, "learning_rate": 0.0002, "epoch": 3.749147920927062, "step": 2750}, {"loss": 1.4811, "grad_norm": 0.7041361331939697, "learning_rate": 0.0002, "epoch": 3.7627811860940694, "step": 2760}, {"loss": 1.4284, "grad_norm": 0.6559903025627136, "learning_rate": 0.0002, "epoch": 3.776414451261077, "step": 2770}, {"loss": 1.4608, "grad_norm": 0.6602269411087036, "learning_rate": 0.0002, "epoch": 3.7900477164280844, "step": 2780}, {"loss": 1.4588, "grad_norm": 0.692611813545227, "learning_rate": 0.0002, "epoch": 3.8036809815950923, "step": 2790}, {"loss": 1.4065, "grad_norm": 0.7051475644111633, "learning_rate": 0.0002, "epoch": 3.8173142467620993, "step": 2800}, {"loss": 1.4083, "grad_norm": 0.6685371398925781, "learning_rate": 0.0002, "epoch": 3.830947511929107, "step": 2810}, {"loss": 1.5227, "grad_norm": 0.6706477403640747, "learning_rate": 0.0002, "epoch": 3.8445807770961147, "step": 2820}, {"loss": 1.4076, "grad_norm": 0.6671637296676636, "learning_rate": 0.0002, "epoch": 3.858214042263122, "step": 2830}, {"loss": 1.4736, "grad_norm": 0.694092333316803, "learning_rate": 0.0002, "epoch": 3.8718473074301296, "step": 2840}, {"loss": 1.4161, "grad_norm": 0.7349600195884705, "learning_rate": 0.0002, "epoch": 3.885480572597137, "step": 2850}, {"loss": 1.4617, "grad_norm": 0.6647971868515015, "learning_rate": 0.0002, "epoch": 3.8991138377641446, "step": 2860}, {"loss": 1.5046, "grad_norm": 0.806656539440155, "learning_rate": 0.0002, "epoch": 3.912747102931152, "step": 2870}, {"loss": 1.428, "grad_norm": 0.6008772850036621, "learning_rate": 0.0002, "epoch": 3.9263803680981595, "step": 2880}, {"loss": 1.4116, "grad_norm": 0.659227728843689, "learning_rate": 0.0002, "epoch": 3.940013633265167, "step": 2890}, {"loss": 1.4136, "grad_norm": 0.6357656717300415, "learning_rate": 0.0002, "epoch": 3.9536468984321744, "step": 2900}, {"loss": 1.4655, "grad_norm": 0.6541687846183777, "learning_rate": 0.0002, "epoch": 3.967280163599182, "step": 2910}, {"loss": 1.4854, "grad_norm": 0.6090909838676453, "learning_rate": 0.0002, "epoch": 3.9809134287661894, "step": 2920}, {"loss": 1.4615, "grad_norm": 0.7198411822319031, "learning_rate": 0.0002, "epoch": 3.994546693933197, "step": 2930}, {"eval_loss": 1.9278366565704346, "eval_runtime": 53.6567, "eval_samples_per_second": 9.449, "eval_steps_per_second": 1.193, "epoch": 4.0, "step": 2934}, {"loss": 1.3159, "grad_norm": 0.6498575210571289, "learning_rate": 0.0002, "epoch": 4.008179959100205, "step": 2940}, {"loss": 1.2075, "grad_norm": 0.865602433681488, "learning_rate": 0.0002, "epoch": 4.021813224267212, "step": 2950}, {"loss": 1.1744, "grad_norm": 0.8514999151229858, "learning_rate": 0.0002, "epoch": 4.03544648943422, "step": 2960}, {"loss": 1.1553, "grad_norm": 1.0677322149276733, "learning_rate": 0.0002, "epoch": 4.049079754601227, "step": 2970}, {"loss": 1.1962, "grad_norm": 1.0126488208770752, "learning_rate": 0.0002, "epoch": 4.062713019768235, "step": 2980}, {"loss": 1.1631, "grad_norm": 1.0008870363235474, "learning_rate": 0.0002, "epoch": 4.076346284935242, "step": 2990}, {"loss": 1.2154, "grad_norm": 0.7942054271697998, "learning_rate": 0.0002, "epoch": 4.08997955010225, "step": 3000}, {"loss": 1.214, "grad_norm": 1.0482100248336792, "learning_rate": 0.0002, "epoch": 4.103612815269257, "step": 3010}, {"loss": 1.1999, "grad_norm": 1.0516992807388306, "learning_rate": 0.0002, "epoch": 4.1172460804362645, "step": 3020}, {"loss": 1.2108, "grad_norm": 0.8144322037696838, "learning_rate": 0.0002, "epoch": 4.130879345603272, "step": 3030}, {"loss": 1.1782, "grad_norm": 0.952297568321228, "learning_rate": 0.0002, "epoch": 4.144512610770279, "step": 3040}, {"loss": 1.2814, "grad_norm": 1.007645606994629, "learning_rate": 0.0002, "epoch": 4.158145875937287, "step": 3050}, {"loss": 1.1731, "grad_norm": 1.0480353832244873, "learning_rate": 0.0002, "epoch": 4.171779141104294, "step": 3060}, {"loss": 1.196, "grad_norm": 0.9270663857460022, "learning_rate": 0.0002, "epoch": 4.185412406271302, "step": 3070}, {"loss": 1.2167, "grad_norm": 1.3415262699127197, "learning_rate": 0.0002, "epoch": 4.199045671438309, "step": 3080}, {"loss": 1.2601, "grad_norm": 1.167606234550476, "learning_rate": 0.0002, "epoch": 4.212678936605317, "step": 3090}, {"loss": 1.2605, "grad_norm": 0.9418690800666809, "learning_rate": 0.0002, "epoch": 4.226312201772324, "step": 3100}, {"loss": 1.2184, "grad_norm": 1.0885876417160034, "learning_rate": 0.0002, "epoch": 4.239945466939332, "step": 3110}, {"loss": 1.2594, "grad_norm": 0.9165483713150024, "learning_rate": 0.0002, "epoch": 4.253578732106339, "step": 3120}, {"loss": 1.2933, "grad_norm": 0.9154694080352783, "learning_rate": 0.0002, "epoch": 4.267211997273347, "step": 3130}, {"loss": 1.2584, "grad_norm": 1.100580096244812, "learning_rate": 0.0002, "epoch": 4.280845262440354, "step": 3140}, {"loss": 1.251, "grad_norm": 0.9367576241493225, "learning_rate": 0.0002, "epoch": 4.294478527607362, "step": 3150}, {"loss": 1.2032, "grad_norm": 0.9744015336036682, "learning_rate": 0.0002, "epoch": 4.308111792774369, "step": 3160}, {"loss": 1.2787, "grad_norm": 0.9865175485610962, "learning_rate": 0.0002, "epoch": 4.321745057941377, "step": 3170}, {"loss": 1.2161, "grad_norm": 1.0124907493591309, "learning_rate": 0.0002, "epoch": 4.335378323108385, "step": 3180}, {"loss": 1.2452, "grad_norm": 1.1044819355010986, "learning_rate": 0.0002, "epoch": 4.349011588275392, "step": 3190}, {"loss": 1.2483, "grad_norm": 0.9305577278137207, "learning_rate": 0.0002, "epoch": 4.3626448534424, "step": 3200}, {"loss": 1.2101, "grad_norm": 0.969265341758728, "learning_rate": 0.0002, "epoch": 4.376278118609407, "step": 3210}, {"loss": 1.2355, "grad_norm": 1.0671923160552979, "learning_rate": 0.0002, "epoch": 4.389911383776415, "step": 3220}, {"loss": 1.2259, "grad_norm": 0.9440539479255676, "learning_rate": 0.0002, "epoch": 4.403544648943422, "step": 3230}, {"loss": 1.1706, "grad_norm": 0.9824562668800354, "learning_rate": 0.0002, "epoch": 4.41717791411043, "step": 3240}, {"loss": 1.2234, "grad_norm": 1.0245535373687744, "learning_rate": 0.0002, "epoch": 4.430811179277437, "step": 3250}, {"loss": 1.2713, "grad_norm": 0.9629312753677368, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 3260}, {"loss": 1.2689, "grad_norm": 1.1556470394134521, "learning_rate": 0.0002, "epoch": 4.458077709611452, "step": 3270}, {"loss": 1.2214, "grad_norm": 0.9796679019927979, "learning_rate": 0.0002, "epoch": 4.47171097477846, "step": 3280}, {"loss": 1.2823, "grad_norm": 0.9030535221099854, "learning_rate": 0.0002, "epoch": 4.485344239945467, "step": 3290}, {"loss": 1.2111, "grad_norm": 0.9142820835113525, "learning_rate": 0.0002, "epoch": 4.4989775051124745, "step": 3300}, {"loss": 1.2398, "grad_norm": 0.966867208480835, "learning_rate": 0.0002, "epoch": 4.5126107702794815, "step": 3310}, {"loss": 1.2537, "grad_norm": 1.0127079486846924, "learning_rate": 0.0002, "epoch": 4.5262440354464895, "step": 3320}, {"loss": 1.2059, "grad_norm": 1.055506706237793, "learning_rate": 0.0002, "epoch": 4.539877300613497, "step": 3330}, {"loss": 1.2958, "grad_norm": 0.9831468462944031, "learning_rate": 0.0002, "epoch": 4.553510565780504, "step": 3340}, {"loss": 1.2643, "grad_norm": 0.9304661154747009, "learning_rate": 0.0002, "epoch": 4.567143830947512, "step": 3350}, {"loss": 1.3621, "grad_norm": 0.9369107484817505, "learning_rate": 0.0002, "epoch": 4.580777096114519, "step": 3360}, {"loss": 1.2301, "grad_norm": 1.009506344795227, "learning_rate": 0.0002, "epoch": 4.594410361281527, "step": 3370}, {"loss": 1.2535, "grad_norm": 1.0575741529464722, "learning_rate": 0.0002, "epoch": 4.608043626448534, "step": 3380}, {"loss": 1.1914, "grad_norm": 0.9102860689163208, "learning_rate": 0.0002, "epoch": 4.621676891615542, "step": 3390}, {"loss": 1.3156, "grad_norm": 0.8111315965652466, "learning_rate": 0.0002, "epoch": 4.635310156782549, "step": 3400}, {"loss": 1.3103, "grad_norm": 0.9459649920463562, "learning_rate": 0.0002, "epoch": 4.648943421949557, "step": 3410}, {"loss": 1.3146, "grad_norm": 0.9709545969963074, "learning_rate": 0.0002, "epoch": 4.662576687116564, "step": 3420}, {"loss": 1.2958, "grad_norm": 0.9909247159957886, "learning_rate": 0.0002, "epoch": 4.676209952283572, "step": 3430}, {"loss": 1.3186, "grad_norm": 0.9094610810279846, "learning_rate": 0.0002, "epoch": 4.689843217450579, "step": 3440}, {"loss": 1.3397, "grad_norm": 0.9012220501899719, "learning_rate": 0.0002, "epoch": 4.703476482617587, "step": 3450}, {"loss": 1.2595, "grad_norm": 0.8669242858886719, "learning_rate": 0.0002, "epoch": 4.717109747784594, "step": 3460}, {"loss": 1.2762, "grad_norm": 0.9753699898719788, "learning_rate": 0.0002, "epoch": 4.730743012951602, "step": 3470}, {"loss": 1.2371, "grad_norm": 1.0252684354782104, "learning_rate": 0.0002, "epoch": 4.74437627811861, "step": 3480}, {"loss": 1.2536, "grad_norm": 1.208098292350769, "learning_rate": 0.0002, "epoch": 4.758009543285617, "step": 3490}, {"loss": 1.2256, "grad_norm": 0.8632914423942566, "learning_rate": 0.0002, "epoch": 4.771642808452625, "step": 3500}, {"loss": 1.3062, "grad_norm": 1.0084818601608276, "learning_rate": 0.0002, "epoch": 4.785276073619632, "step": 3510}, {"loss": 1.3004, "grad_norm": 0.9095172882080078, "learning_rate": 0.0002, "epoch": 4.79890933878664, "step": 3520}, {"loss": 1.263, "grad_norm": 0.9740135669708252, "learning_rate": 0.0002, "epoch": 4.812542603953647, "step": 3530}, {"loss": 1.2816, "grad_norm": 0.8862348794937134, "learning_rate": 0.0002, "epoch": 4.826175869120655, "step": 3540}, {"loss": 1.2275, "grad_norm": 1.0761774778366089, "learning_rate": 0.0002, "epoch": 4.839809134287662, "step": 3550}, {"loss": 1.2257, "grad_norm": 1.0134117603302002, "learning_rate": 0.0002, "epoch": 4.85344239945467, "step": 3560}, {"loss": 1.2904, "grad_norm": 0.9262851476669312, "learning_rate": 0.0002, "epoch": 4.867075664621677, "step": 3570}, {"loss": 1.1466, "grad_norm": 0.9518504738807678, "learning_rate": 0.0002, "epoch": 4.8807089297886845, "step": 3580}, {"loss": 1.2741, "grad_norm": 1.10103178024292, "learning_rate": 0.0002, "epoch": 4.894342194955692, "step": 3590}, {"loss": 1.2592, "grad_norm": 1.0133225917816162, "learning_rate": 0.0002, "epoch": 4.9079754601226995, "step": 3600}, {"loss": 1.2856, "grad_norm": 0.9637737274169922, "learning_rate": 0.0002, "epoch": 4.9216087252897065, "step": 3610}, {"loss": 1.2991, "grad_norm": 0.9800633192062378, "learning_rate": 0.0002, "epoch": 4.935241990456714, "step": 3620}, {"loss": 1.2872, "grad_norm": 1.0065973997116089, "learning_rate": 0.0002, "epoch": 4.948875255623722, "step": 3630}, {"loss": 1.2408, "grad_norm": 0.9354690313339233, "learning_rate": 0.0002, "epoch": 4.962508520790729, "step": 3640}, {"loss": 1.291, "grad_norm": 0.9744119048118591, "learning_rate": 0.0002, "epoch": 4.976141785957737, "step": 3650}, {"loss": 1.2513, "grad_norm": 0.9357708096504211, "learning_rate": 0.0002, "epoch": 4.989775051124744, "step": 3660}, {"eval_loss": 2.0763096809387207, "eval_runtime": 53.6578, "eval_samples_per_second": 9.449, "eval_steps_per_second": 1.193, "epoch": 4.999318336741649, "step": 3667}, {"loss": 1.2323, "grad_norm": 1.3171669244766235, "learning_rate": 0.0002, "epoch": 5.003408316291752, "step": 3670}, {"loss": 0.9509, "grad_norm": 1.4427374601364136, "learning_rate": 0.0002, "epoch": 5.017041581458759, "step": 3680}, {"loss": 1.011, "grad_norm": 0.9313797354698181, "learning_rate": 0.0002, "epoch": 5.030674846625767, "step": 3690}, {"loss": 0.9481, "grad_norm": 1.417641282081604, "learning_rate": 0.0002, "epoch": 5.044308111792774, "step": 3700}, {"loss": 0.9477, "grad_norm": 1.097440242767334, "learning_rate": 0.0002, "epoch": 5.057941376959782, "step": 3710}, {"loss": 1.0416, "grad_norm": 1.4277986288070679, "learning_rate": 0.0002, "epoch": 5.071574642126789, "step": 3720}, {"loss": 0.9718, "grad_norm": 1.2520873546600342, "learning_rate": 0.0002, "epoch": 5.085207907293797, "step": 3730}, {"loss": 0.9531, "grad_norm": 1.39503812789917, "learning_rate": 0.0002, "epoch": 5.098841172460804, "step": 3740}, {"loss": 0.9658, "grad_norm": 1.2345329523086548, "learning_rate": 0.0002, "epoch": 5.112474437627812, "step": 3750}, {"loss": 1.0615, "grad_norm": 1.2700239419937134, "learning_rate": 0.0002, "epoch": 5.126107702794819, "step": 3760}, {"loss": 0.993, "grad_norm": 1.5343066453933716, "learning_rate": 0.0002, "epoch": 5.139740967961827, "step": 3770}, {"loss": 0.9378, "grad_norm": 1.4191608428955078, "learning_rate": 0.0002, "epoch": 5.153374233128835, "step": 3780}, {"loss": 1.0179, "grad_norm": 1.4591023921966553, "learning_rate": 0.0002, "epoch": 5.167007498295842, "step": 3790}, {"loss": 1.0143, "grad_norm": 1.6158121824264526, "learning_rate": 0.0002, "epoch": 5.18064076346285, "step": 3800}, {"loss": 1.0056, "grad_norm": 1.6077582836151123, "learning_rate": 0.0002, "epoch": 5.194274028629857, "step": 3810}, {"loss": 0.9711, "grad_norm": 1.2815653085708618, "learning_rate": 0.0002, "epoch": 5.207907293796865, "step": 3820}, {"loss": 1.0131, "grad_norm": 1.2427219152450562, "learning_rate": 0.0002, "epoch": 5.221540558963872, "step": 3830}, {"loss": 0.9901, "grad_norm": 1.3013232946395874, "learning_rate": 0.0002, "epoch": 5.23517382413088, "step": 3840}, {"loss": 0.9862, "grad_norm": 1.4643588066101074, "learning_rate": 0.0002, "epoch": 5.248807089297887, "step": 3850}, {"loss": 1.0149, "grad_norm": 1.2571916580200195, "learning_rate": 0.0002, "epoch": 5.2624403544648946, "step": 3860}, {"loss": 0.9686, "grad_norm": 1.226682186126709, "learning_rate": 0.0002, "epoch": 5.276073619631902, "step": 3870}, {"loss": 0.9417, "grad_norm": 1.2541271448135376, "learning_rate": 0.0002, "epoch": 5.2897068847989095, "step": 3880}, {"loss": 0.9767, "grad_norm": 1.2340261936187744, "learning_rate": 0.0002, "epoch": 5.3033401499659165, "step": 3890}, {"loss": 1.0173, "grad_norm": 1.345527172088623, "learning_rate": 0.0002, "epoch": 5.316973415132924, "step": 3900}, {"loss": 1.0638, "grad_norm": 1.2128909826278687, "learning_rate": 0.0002, "epoch": 5.3306066802999315, "step": 3910}, {"loss": 1.0002, "grad_norm": 1.3052637577056885, "learning_rate": 0.0002, "epoch": 5.344239945466939, "step": 3920}, {"loss": 0.9754, "grad_norm": 1.1017392873764038, "learning_rate": 0.0002, "epoch": 5.357873210633947, "step": 3930}, {"loss": 1.0579, "grad_norm": 1.26950204372406, "learning_rate": 0.0002, "epoch": 5.371506475800954, "step": 3940}, {"loss": 1.0816, "grad_norm": 1.3372546434402466, "learning_rate": 0.0002, "epoch": 5.385139740967962, "step": 3950}, {"loss": 1.0529, "grad_norm": 1.3115156888961792, "learning_rate": 0.0002, "epoch": 5.398773006134969, "step": 3960}, {"loss": 1.1179, "grad_norm": 1.3511474132537842, "learning_rate": 0.0002, "epoch": 5.412406271301977, "step": 3970}, {"loss": 1.0352, "grad_norm": 1.1001893281936646, "learning_rate": 0.0002, "epoch": 5.426039536468984, "step": 3980}, {"loss": 1.0855, "grad_norm": 1.2810745239257812, "learning_rate": 0.0002, "epoch": 5.439672801635992, "step": 3990}, {"loss": 1.0573, "grad_norm": 1.2999306917190552, "learning_rate": 0.0002, "epoch": 5.453306066802999, "step": 4000}, {"loss": 1.0073, "grad_norm": 1.172553300857544, "learning_rate": 0.0002, "epoch": 5.466939331970007, "step": 4010}, {"loss": 1.003, "grad_norm": 1.1483557224273682, "learning_rate": 0.0002, "epoch": 5.480572597137014, "step": 4020}, {"loss": 1.0704, "grad_norm": 1.4148036241531372, "learning_rate": 0.0002, "epoch": 5.494205862304022, "step": 4030}, {"loss": 1.0519, "grad_norm": 1.1611121892929077, "learning_rate": 0.0002, "epoch": 5.507839127471029, "step": 4040}, {"loss": 1.0775, "grad_norm": 1.3837119340896606, "learning_rate": 0.0002, "epoch": 5.521472392638037, "step": 4050}, {"loss": 1.0257, "grad_norm": 1.3025696277618408, "learning_rate": 0.0002, "epoch": 5.535105657805044, "step": 4060}, {"loss": 1.0628, "grad_norm": 1.348091959953308, "learning_rate": 0.0002, "epoch": 5.548738922972052, "step": 4070}, {"loss": 1.0447, "grad_norm": 1.3463449478149414, "learning_rate": 0.0002, "epoch": 5.56237218813906, "step": 4080}, {"loss": 1.039, "grad_norm": 1.3904176950454712, "learning_rate": 0.0002, "epoch": 5.576005453306067, "step": 4090}, {"loss": 1.0963, "grad_norm": 1.2737950086593628, "learning_rate": 0.0002, "epoch": 5.589638718473074, "step": 4100}, {"loss": 1.0441, "grad_norm": 1.3311827182769775, "learning_rate": 0.0002, "epoch": 5.603271983640082, "step": 4110}, {"loss": 1.0521, "grad_norm": 1.24485182762146, "learning_rate": 0.0002, "epoch": 5.61690524880709, "step": 4120}, {"loss": 1.1103, "grad_norm": 1.2724957466125488, "learning_rate": 0.0002, "epoch": 5.630538513974097, "step": 4130}, {"loss": 1.0588, "grad_norm": 1.3439847230911255, "learning_rate": 0.0002, "epoch": 5.644171779141105, "step": 4140}, {"loss": 1.0257, "grad_norm": 1.372359037399292, "learning_rate": 0.0002, "epoch": 5.657805044308112, "step": 4150}, {"loss": 1.0475, "grad_norm": 1.2322949171066284, "learning_rate": 0.0002, "epoch": 5.6714383094751195, "step": 4160}, {"loss": 1.0465, "grad_norm": 1.4859193563461304, "learning_rate": 0.0002, "epoch": 5.6850715746421265, "step": 4170}, {"loss": 1.1569, "grad_norm": 1.4318448305130005, "learning_rate": 0.0002, "epoch": 5.6987048398091344, "step": 4180}, {"loss": 1.017, "grad_norm": 1.1533565521240234, "learning_rate": 0.0002, "epoch": 5.7123381049761415, "step": 4190}, {"loss": 1.0948, "grad_norm": 1.3009696006774902, "learning_rate": 0.0002, "epoch": 5.725971370143149, "step": 4200}, {"loss": 1.1229, "grad_norm": 1.3972162008285522, "learning_rate": 0.0002, "epoch": 5.739604635310156, "step": 4210}, {"loss": 1.033, "grad_norm": 1.2142186164855957, "learning_rate": 0.0002, "epoch": 5.753237900477164, "step": 4220}, {"loss": 1.0588, "grad_norm": 1.401191234588623, "learning_rate": 0.0002, "epoch": 5.766871165644172, "step": 4230}, {"loss": 1.0722, "grad_norm": 1.4124404191970825, "learning_rate": 0.0002, "epoch": 5.780504430811179, "step": 4240}, {"loss": 1.0826, "grad_norm": 1.3488332033157349, "learning_rate": 0.0002, "epoch": 5.794137695978186, "step": 4250}, {"loss": 1.0599, "grad_norm": 1.3671752214431763, "learning_rate": 0.0002, "epoch": 5.807770961145194, "step": 4260}, {"loss": 1.1294, "grad_norm": 1.2608201503753662, "learning_rate": 0.0002, "epoch": 5.821404226312202, "step": 4270}, {"loss": 1.1216, "grad_norm": 1.1814045906066895, "learning_rate": 0.0002, "epoch": 5.835037491479209, "step": 4280}, {"loss": 1.0973, "grad_norm": 1.4139586687088013, "learning_rate": 0.0002, "epoch": 5.848670756646217, "step": 4290}, {"loss": 1.0656, "grad_norm": 1.34248948097229, "learning_rate": 0.0002, "epoch": 5.862304021813224, "step": 4300}, {"loss": 1.0791, "grad_norm": 1.1428139209747314, "learning_rate": 0.0002, "epoch": 5.875937286980232, "step": 4310}, {"loss": 1.0556, "grad_norm": 1.1941087245941162, "learning_rate": 0.0002, "epoch": 5.889570552147239, "step": 4320}, {"loss": 1.1089, "grad_norm": 1.2374001741409302, "learning_rate": 0.0002, "epoch": 5.903203817314247, "step": 4330}, {"loss": 1.0802, "grad_norm": 1.4314988851547241, "learning_rate": 0.0002, "epoch": 5.916837082481254, "step": 4340}, {"loss": 1.133, "grad_norm": 1.1286126375198364, "learning_rate": 0.0002, "epoch": 5.930470347648262, "step": 4350}, {"loss": 1.0807, "grad_norm": 1.25884211063385, "learning_rate": 0.0002, "epoch": 5.944103612815269, "step": 4360}, {"loss": 1.1189, "grad_norm": 1.223357915878296, "learning_rate": 0.0002, "epoch": 5.957736877982277, "step": 4370}, {"loss": 1.1335, "grad_norm": 1.2173810005187988, "learning_rate": 0.0002, "epoch": 5.971370143149285, "step": 4380}, {"loss": 1.1201, "grad_norm": 1.3152292966842651, "learning_rate": 0.0002, "epoch": 5.985003408316292, "step": 4390}, {"loss": 1.1456, "grad_norm": 1.5576739311218262, "learning_rate": 0.0002, "epoch": 5.998636673483299, "step": 4400}, {"eval_loss": 2.3435311317443848, "eval_runtime": 53.6362, "eval_samples_per_second": 9.453, "eval_steps_per_second": 1.193, "epoch": 6.0, "step": 4401}, {"loss": 0.8618, "grad_norm": 2.027981758117676, "learning_rate": 0.0002, "epoch": 6.012269938650307, "step": 4410}, {"loss": 0.7702, "grad_norm": 1.4775491952896118, "learning_rate": 0.0002, "epoch": 6.025903203817315, "step": 4420}, {"loss": 0.8042, "grad_norm": 1.6902967691421509, "learning_rate": 0.0002, "epoch": 6.039536468984322, "step": 4430}, {"loss": 0.7363, "grad_norm": 1.2506479024887085, "learning_rate": 0.0002, "epoch": 6.0531697341513295, "step": 4440}, {"loss": 0.7653, "grad_norm": 1.5935661792755127, "learning_rate": 0.0002, "epoch": 6.0668029993183366, "step": 4450}, {"loss": 0.7869, "grad_norm": 1.2966011762619019, "learning_rate": 0.0002, "epoch": 6.0804362644853445, "step": 4460}, {"loss": 0.7186, "grad_norm": 1.5247948169708252, "learning_rate": 0.0002, "epoch": 6.0940695296523515, "step": 4470}, {"loss": 0.7864, "grad_norm": 1.6415225267410278, "learning_rate": 0.0002, "epoch": 6.107702794819359, "step": 4480}, {"loss": 0.7561, "grad_norm": 1.5510778427124023, "learning_rate": 0.0002, "epoch": 6.121336059986366, "step": 4490}, {"loss": 0.7628, "grad_norm": 1.361097812652588, "learning_rate": 0.0002, "epoch": 6.134969325153374, "step": 4500}, {"loss": 0.8053, "grad_norm": 1.8347383737564087, "learning_rate": 0.0002, "epoch": 6.148602590320381, "step": 4510}, {"loss": 0.8074, "grad_norm": 1.570560097694397, "learning_rate": 0.0002, "epoch": 6.162235855487389, "step": 4520}, {"loss": 0.7536, "grad_norm": 1.517993688583374, "learning_rate": 0.0002, "epoch": 6.175869120654396, "step": 4530}, {"loss": 0.8028, "grad_norm": 1.4517489671707153, "learning_rate": 0.0002, "epoch": 6.189502385821404, "step": 4540}, {"loss": 0.8633, "grad_norm": 1.557098627090454, "learning_rate": 0.0002, "epoch": 6.203135650988412, "step": 4550}, {"loss": 0.7704, "grad_norm": 1.7379891872406006, "learning_rate": 0.0002, "epoch": 6.216768916155419, "step": 4560}, {"loss": 0.7341, "grad_norm": 2.2292542457580566, "learning_rate": 0.0002, "epoch": 6.230402181322427, "step": 4570}, {"loss": 0.7883, "grad_norm": 1.834366798400879, "learning_rate": 0.0002, "epoch": 6.244035446489434, "step": 4580}, {"loss": 0.8222, "grad_norm": 1.6755090951919556, "learning_rate": 0.0002, "epoch": 6.257668711656442, "step": 4590}, {"loss": 0.8245, "grad_norm": 1.828898549079895, "learning_rate": 0.0002, "epoch": 6.271301976823449, "step": 4600}, {"loss": 0.8116, "grad_norm": 1.9773457050323486, "learning_rate": 0.0002, "epoch": 6.284935241990457, "step": 4610}, {"loss": 0.811, "grad_norm": 1.533369541168213, "learning_rate": 0.0002, "epoch": 6.298568507157464, "step": 4620}, {"loss": 0.807, "grad_norm": 1.5432997941970825, "learning_rate": 0.0002, "epoch": 6.312201772324472, "step": 4630}, {"loss": 0.818, "grad_norm": 1.6686866283416748, "learning_rate": 0.0002, "epoch": 6.325835037491479, "step": 4640}, {"loss": 0.8656, "grad_norm": 1.545304298400879, "learning_rate": 0.0002, "epoch": 6.339468302658487, "step": 4650}, {"loss": 0.8239, "grad_norm": 1.5981945991516113, "learning_rate": 0.0002, "epoch": 6.353101567825494, "step": 4660}, {"loss": 0.8162, "grad_norm": 1.6973154544830322, "learning_rate": 0.0002, "epoch": 6.366734832992502, "step": 4670}, {"loss": 0.8377, "grad_norm": 1.6782612800598145, "learning_rate": 0.0002, "epoch": 6.38036809815951, "step": 4680}, {"loss": 0.8185, "grad_norm": 1.5710086822509766, "learning_rate": 0.0002, "epoch": 6.394001363326517, "step": 4690}, {"loss": 0.7948, "grad_norm": 1.7241147756576538, "learning_rate": 0.0002, "epoch": 6.407634628493525, "step": 4700}, {"loss": 0.8768, "grad_norm": 1.7736736536026, "learning_rate": 0.0002, "epoch": 6.421267893660532, "step": 4710}, {"loss": 0.8607, "grad_norm": 1.7924901247024536, "learning_rate": 0.0002, "epoch": 6.4349011588275395, "step": 4720}, {"loss": 0.832, "grad_norm": 1.4030500650405884, "learning_rate": 0.0002, "epoch": 6.448534423994547, "step": 4730}, {"loss": 0.8806, "grad_norm": 1.6925519704818726, "learning_rate": 0.0002, "epoch": 6.4621676891615545, "step": 4740}, {"loss": 0.8556, "grad_norm": 1.362905502319336, "learning_rate": 0.0002, "epoch": 6.4758009543285615, "step": 4750}, {"loss": 0.838, "grad_norm": 1.5281150341033936, "learning_rate": 0.0002, "epoch": 6.489434219495569, "step": 4760}, {"loss": 0.8396, "grad_norm": 1.524671196937561, "learning_rate": 0.0002, "epoch": 6.5030674846625764, "step": 4770}, {"loss": 0.8225, "grad_norm": 1.7029320001602173, "learning_rate": 0.0002, "epoch": 6.516700749829584, "step": 4780}, {"loss": 0.8377, "grad_norm": 1.4663511514663696, "learning_rate": 0.0002, "epoch": 6.530334014996591, "step": 4790}, {"loss": 0.8018, "grad_norm": 1.7682101726531982, "learning_rate": 0.0002, "epoch": 6.543967280163599, "step": 4800}, {"loss": 0.8318, "grad_norm": 1.6056565046310425, "learning_rate": 0.0002, "epoch": 6.557600545330606, "step": 4810}, {"loss": 0.8747, "grad_norm": 1.6552391052246094, "learning_rate": 0.0002, "epoch": 6.571233810497614, "step": 4820}, {"loss": 0.8559, "grad_norm": 1.4265215396881104, "learning_rate": 0.0002, "epoch": 6.584867075664622, "step": 4830}, {"loss": 0.8606, "grad_norm": 1.6225470304489136, "learning_rate": 0.0002, "epoch": 6.598500340831629, "step": 4840}, {"loss": 0.8817, "grad_norm": 1.6568684577941895, "learning_rate": 0.0002, "epoch": 6.612133605998636, "step": 4850}, {"loss": 0.8825, "grad_norm": 1.760115146636963, "learning_rate": 0.0002, "epoch": 6.625766871165644, "step": 4860}, {"loss": 0.9227, "grad_norm": 1.627966046333313, "learning_rate": 0.0002, "epoch": 6.639400136332652, "step": 4870}, {"loss": 0.8825, "grad_norm": 1.7053254842758179, "learning_rate": 0.0002, "epoch": 6.653033401499659, "step": 4880}, {"loss": 0.857, "grad_norm": 1.5339484214782715, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 4890}, {"loss": 0.8482, "grad_norm": 1.5594874620437622, "learning_rate": 0.0002, "epoch": 6.680299931833674, "step": 4900}, {"loss": 0.842, "grad_norm": 1.5322152376174927, "learning_rate": 0.0002, "epoch": 6.693933197000682, "step": 4910}, {"loss": 0.8049, "grad_norm": 1.733410358428955, "learning_rate": 0.0002, "epoch": 6.707566462167689, "step": 4920}, {"loss": 0.9099, "grad_norm": 1.3626887798309326, "learning_rate": 0.0002, "epoch": 6.721199727334697, "step": 4930}, {"loss": 0.9481, "grad_norm": 1.6323494911193848, "learning_rate": 0.0002, "epoch": 6.734832992501704, "step": 4940}, {"loss": 0.8803, "grad_norm": 1.6548917293548584, "learning_rate": 0.0002, "epoch": 6.748466257668712, "step": 4950}, {"loss": 0.9149, "grad_norm": 1.7894278764724731, "learning_rate": 0.0002, "epoch": 6.762099522835719, "step": 4960}, {"loss": 0.9137, "grad_norm": 1.7960841655731201, "learning_rate": 0.0002, "epoch": 6.775732788002727, "step": 4970}, {"loss": 0.9088, "grad_norm": 1.4888852834701538, "learning_rate": 0.0002, "epoch": 6.789366053169735, "step": 4980}, {"loss": 0.9495, "grad_norm": 1.6368865966796875, "learning_rate": 0.0002, "epoch": 6.802999318336742, "step": 4990}, {"loss": 0.9939, "grad_norm": 1.7106667757034302, "learning_rate": 0.0002, "epoch": 6.816632583503749, "step": 5000}, {"loss": 0.8551, "grad_norm": 4.131956100463867, "learning_rate": 0.0002, "epoch": 6.830265848670757, "step": 5010}, {"loss": 0.908, "grad_norm": 1.6357536315917969, "learning_rate": 0.0002, "epoch": 6.8438991138377645, "step": 5020}, {"loss": 0.8661, "grad_norm": 1.621524453163147, "learning_rate": 0.0002, "epoch": 6.8575323790047715, "step": 5030}, {"loss": 0.9177, "grad_norm": 1.6400790214538574, "learning_rate": 0.0002, "epoch": 6.871165644171779, "step": 5040}, {"loss": 0.9204, "grad_norm": 1.823006272315979, "learning_rate": 0.0002, "epoch": 6.8847989093387865, "step": 5050}, {"loss": 0.9133, "grad_norm": 1.6328210830688477, "learning_rate": 0.0002, "epoch": 6.898432174505794, "step": 5060}, {"loss": 0.9138, "grad_norm": 1.3616089820861816, "learning_rate": 0.0002, "epoch": 6.912065439672801, "step": 5070}, {"loss": 0.8791, "grad_norm": 1.7202986478805542, "learning_rate": 0.0002, "epoch": 6.925698704839809, "step": 5080}, {"loss": 0.8331, "grad_norm": 1.8145297765731812, "learning_rate": 0.0002, "epoch": 6.939331970006816, "step": 5090}, {"loss": 0.861, "grad_norm": 1.5432910919189453, "learning_rate": 0.0002, "epoch": 6.952965235173824, "step": 5100}, {"loss": 0.9282, "grad_norm": 1.2784099578857422, "learning_rate": 0.0002, "epoch": 6.966598500340831, "step": 5110}, {"loss": 0.9189, "grad_norm": 1.556593894958496, "learning_rate": 0.0002, "epoch": 6.980231765507839, "step": 5120}, {"loss": 0.8961, "grad_norm": 1.5102856159210205, "learning_rate": 0.0002, "epoch": 6.993865030674847, "step": 5130}]} +{"epoch": 7.994546693933197, "step": 5864, "epoch_duration": 1089.3252186775208, "total_accumulated_duration": 8756.72781586647, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-733", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0982, "grad_norm": 0.7714291214942932, "learning_rate": 0.0002, "epoch": 0.013633265167007498, "step": 10}, {"loss": 2.5206, "grad_norm": 0.5473978519439697, "learning_rate": 0.0002, "epoch": 0.027266530334014997, "step": 20}, {"loss": 2.3079, "grad_norm": 0.5452795624732971, "learning_rate": 0.0002, "epoch": 0.0408997955010225, "step": 30}, {"loss": 2.0019, "grad_norm": 0.5098028779029846, "learning_rate": 0.0002, "epoch": 0.054533060668029994, "step": 40}, {"loss": 1.9333, "grad_norm": 0.48062971234321594, "learning_rate": 0.0002, "epoch": 0.0681663258350375, "step": 50}, {"loss": 1.9355, "grad_norm": 0.4505695104598999, "learning_rate": 0.0002, "epoch": 0.081799591002045, "step": 60}, {"loss": 1.9312, "grad_norm": 0.41609591245651245, "learning_rate": 0.0002, "epoch": 0.09543285616905249, "step": 70}, {"loss": 1.8656, "grad_norm": 0.4323892593383789, "learning_rate": 0.0002, "epoch": 0.10906612133605999, "step": 80}, {"loss": 1.9294, "grad_norm": 0.4670293629169464, "learning_rate": 0.0002, "epoch": 0.12269938650306748, "step": 90}, {"loss": 1.7946, "grad_norm": 0.40623316168785095, "learning_rate": 0.0002, "epoch": 0.136332651670075, "step": 100}, {"loss": 1.8565, "grad_norm": 0.3620383143424988, "learning_rate": 0.0002, "epoch": 0.1499659168370825, "step": 110}, {"loss": 1.9238, "grad_norm": 0.332218736410141, "learning_rate": 0.0002, "epoch": 0.16359918200409, "step": 120}, {"loss": 1.93, "grad_norm": 0.4004521667957306, "learning_rate": 0.0002, "epoch": 0.17723244717109748, "step": 130}, {"loss": 1.7549, "grad_norm": 0.3698360323905945, "learning_rate": 0.0002, "epoch": 0.19086571233810498, "step": 140}, {"loss": 1.8771, "grad_norm": 0.3847949504852295, "learning_rate": 0.0002, "epoch": 0.20449897750511248, "step": 150}, {"loss": 1.8316, "grad_norm": 0.36843451857566833, "learning_rate": 0.0002, "epoch": 0.21813224267211997, "step": 160}, {"loss": 1.838, "grad_norm": 0.37301021814346313, "learning_rate": 0.0002, "epoch": 0.23176550783912747, "step": 170}, {"loss": 1.8909, "grad_norm": 0.3718886971473694, "learning_rate": 0.0002, "epoch": 0.24539877300613497, "step": 180}, {"loss": 1.8454, "grad_norm": 0.3088490962982178, "learning_rate": 0.0002, "epoch": 0.25903203817314246, "step": 190}, {"loss": 1.9254, "grad_norm": 0.3611852526664734, "learning_rate": 0.0002, "epoch": 0.27266530334015, "step": 200}, {"loss": 1.7844, "grad_norm": 0.36093324422836304, "learning_rate": 0.0002, "epoch": 0.28629856850715746, "step": 210}, {"loss": 1.719, "grad_norm": 0.3250400722026825, "learning_rate": 0.0002, "epoch": 0.299931833674165, "step": 220}, {"loss": 1.8729, "grad_norm": 0.3566756248474121, "learning_rate": 0.0002, "epoch": 0.31356509884117245, "step": 230}, {"loss": 1.9259, "grad_norm": 0.32872408628463745, "learning_rate": 0.0002, "epoch": 0.32719836400818, "step": 240}, {"loss": 1.9033, "grad_norm": 0.3983881175518036, "learning_rate": 0.0002, "epoch": 0.34083162917518744, "step": 250}, {"loss": 1.8588, "grad_norm": 0.3571510910987854, "learning_rate": 0.0002, "epoch": 0.35446489434219497, "step": 260}, {"loss": 1.8539, "grad_norm": 0.3036131262779236, "learning_rate": 0.0002, "epoch": 0.36809815950920244, "step": 270}, {"loss": 1.8572, "grad_norm": 0.36512863636016846, "learning_rate": 0.0002, "epoch": 0.38173142467620996, "step": 280}, {"loss": 1.8022, "grad_norm": 0.3429736793041229, "learning_rate": 0.0002, "epoch": 0.39536468984321743, "step": 290}, {"loss": 1.8754, "grad_norm": 0.3055964708328247, "learning_rate": 0.0002, "epoch": 0.40899795501022496, "step": 300}, {"loss": 1.8384, "grad_norm": 0.33801034092903137, "learning_rate": 0.0002, "epoch": 0.4226312201772324, "step": 310}, {"loss": 1.7933, "grad_norm": 0.348783016204834, "learning_rate": 0.0002, "epoch": 0.43626448534423995, "step": 320}, {"loss": 1.8451, "grad_norm": 0.3057514727115631, "learning_rate": 0.0002, "epoch": 0.4498977505112474, "step": 330}, {"loss": 1.8766, "grad_norm": 0.3849763572216034, "learning_rate": 0.0002, "epoch": 0.46353101567825494, "step": 340}, {"loss": 1.8073, "grad_norm": 0.30080053210258484, "learning_rate": 0.0002, "epoch": 0.47716428084526247, "step": 350}, {"loss": 1.8548, "grad_norm": 0.3595106303691864, "learning_rate": 0.0002, "epoch": 0.49079754601226994, "step": 360}, {"loss": 1.8232, "grad_norm": 0.31099820137023926, "learning_rate": 0.0002, "epoch": 0.5044308111792775, "step": 370}, {"loss": 1.7029, "grad_norm": 0.3157978355884552, "learning_rate": 0.0002, "epoch": 0.5180640763462849, "step": 380}, {"loss": 1.8265, "grad_norm": 0.27960965037345886, "learning_rate": 0.0002, "epoch": 0.5316973415132924, "step": 390}, {"loss": 1.7414, "grad_norm": 0.3102385103702545, "learning_rate": 0.0002, "epoch": 0.5453306066803, "step": 400}, {"loss": 1.7461, "grad_norm": 0.32828861474990845, "learning_rate": 0.0002, "epoch": 0.5589638718473074, "step": 410}, {"loss": 1.8165, "grad_norm": 0.29560017585754395, "learning_rate": 0.0002, "epoch": 0.5725971370143149, "step": 420}, {"loss": 1.9455, "grad_norm": 0.33316895365715027, "learning_rate": 0.0002, "epoch": 0.5862304021813224, "step": 430}, {"loss": 1.8241, "grad_norm": 0.30420982837677, "learning_rate": 0.0002, "epoch": 0.59986366734833, "step": 440}, {"loss": 1.7565, "grad_norm": 0.32619214057922363, "learning_rate": 0.0002, "epoch": 0.6134969325153374, "step": 450}, {"loss": 1.7945, "grad_norm": 0.3603750765323639, "learning_rate": 0.0002, "epoch": 0.6271301976823449, "step": 460}, {"loss": 1.7773, "grad_norm": 0.30834096670150757, "learning_rate": 0.0002, "epoch": 0.6407634628493524, "step": 470}, {"loss": 1.8058, "grad_norm": 0.28756365180015564, "learning_rate": 0.0002, "epoch": 0.65439672801636, "step": 480}, {"loss": 1.744, "grad_norm": 0.2878406345844269, "learning_rate": 0.0002, "epoch": 0.6680299931833674, "step": 490}, {"loss": 1.8581, "grad_norm": 0.31329697370529175, "learning_rate": 0.0002, "epoch": 0.6816632583503749, "step": 500}, {"loss": 1.7886, "grad_norm": 0.3405822515487671, "learning_rate": 0.0002, "epoch": 0.6952965235173824, "step": 510}, {"loss": 1.778, "grad_norm": 0.305560827255249, "learning_rate": 0.0002, "epoch": 0.7089297886843899, "step": 520}, {"loss": 1.7592, "grad_norm": 0.2973416745662689, "learning_rate": 0.0002, "epoch": 0.7225630538513974, "step": 530}, {"loss": 1.8223, "grad_norm": 0.327303946018219, "learning_rate": 0.0002, "epoch": 0.7361963190184049, "step": 540}, {"loss": 1.8591, "grad_norm": 0.62595534324646, "learning_rate": 0.0002, "epoch": 0.7498295841854125, "step": 550}, {"loss": 1.7466, "grad_norm": 0.3129784166812897, "learning_rate": 0.0002, "epoch": 0.7634628493524199, "step": 560}, {"loss": 1.8035, "grad_norm": 0.32496583461761475, "learning_rate": 0.0002, "epoch": 0.7770961145194274, "step": 570}, {"loss": 1.7787, "grad_norm": 0.3098868131637573, "learning_rate": 0.0002, "epoch": 0.7907293796864349, "step": 580}, {"loss": 1.7196, "grad_norm": 0.30726853013038635, "learning_rate": 0.0002, "epoch": 0.8043626448534424, "step": 590}, {"loss": 1.7898, "grad_norm": 0.2964220643043518, "learning_rate": 0.0002, "epoch": 0.8179959100204499, "step": 600}, {"loss": 1.8114, "grad_norm": 0.32352274656295776, "learning_rate": 0.0002, "epoch": 0.8316291751874574, "step": 610}, {"loss": 1.811, "grad_norm": 0.2938912510871887, "learning_rate": 0.0002, "epoch": 0.8452624403544649, "step": 620}, {"loss": 1.7727, "grad_norm": 0.295559823513031, "learning_rate": 0.0002, "epoch": 0.8588957055214724, "step": 630}, {"loss": 1.9, "grad_norm": 0.34102028608322144, "learning_rate": 0.0002, "epoch": 0.8725289706884799, "step": 640}, {"loss": 1.8006, "grad_norm": 0.29676181077957153, "learning_rate": 0.0002, "epoch": 0.8861622358554874, "step": 650}, {"loss": 1.8099, "grad_norm": 0.3108902871608734, "learning_rate": 0.0002, "epoch": 0.8997955010224948, "step": 660}, {"loss": 1.7955, "grad_norm": 0.2690821588039398, "learning_rate": 0.0002, "epoch": 0.9134287661895024, "step": 670}, {"loss": 1.7881, "grad_norm": 0.32752540707588196, "learning_rate": 0.0002, "epoch": 0.9270620313565099, "step": 680}, {"loss": 1.7661, "grad_norm": 0.8029476404190063, "learning_rate": 0.0002, "epoch": 0.9406952965235174, "step": 690}, {"loss": 1.7733, "grad_norm": 0.30534422397613525, "learning_rate": 0.0002, "epoch": 0.9543285616905249, "step": 700}, {"loss": 1.7614, "grad_norm": 0.2899954319000244, "learning_rate": 0.0002, "epoch": 0.9679618268575324, "step": 710}, {"loss": 1.7845, "grad_norm": 0.28814372420310974, "learning_rate": 0.0002, "epoch": 0.9815950920245399, "step": 720}, {"loss": 1.8865, "grad_norm": 0.3061596751213074, "learning_rate": 0.0002, "epoch": 0.9952283571915473, "step": 730}, {"eval_loss": 1.8171186447143555, "eval_runtime": 53.6047, "eval_samples_per_second": 9.458, "eval_steps_per_second": 1.194, "epoch": 0.9993183367416496, "step": 733}, {"loss": 1.6202, "grad_norm": 0.3140897750854492, "learning_rate": 0.0002, "epoch": 1.008861622358555, "step": 740}, {"loss": 1.8409, "grad_norm": 0.3346109390258789, "learning_rate": 0.0002, "epoch": 1.0224948875255624, "step": 750}, {"loss": 1.6777, "grad_norm": 0.3582976758480072, "learning_rate": 0.0002, "epoch": 1.0361281526925699, "step": 760}, {"loss": 1.7306, "grad_norm": 0.30408260226249695, "learning_rate": 0.0002, "epoch": 1.0497614178595773, "step": 770}, {"loss": 1.6967, "grad_norm": 0.323585569858551, "learning_rate": 0.0002, "epoch": 1.0633946830265848, "step": 780}, {"loss": 1.768, "grad_norm": 0.3474137783050537, "learning_rate": 0.0002, "epoch": 1.0770279481935923, "step": 790}, {"loss": 1.6895, "grad_norm": 0.35721147060394287, "learning_rate": 0.0002, "epoch": 1.0906612133606, "step": 800}, {"loss": 1.718, "grad_norm": 0.35366931557655334, "learning_rate": 0.0002, "epoch": 1.1042944785276074, "step": 810}, {"loss": 1.6797, "grad_norm": 0.3250770568847656, "learning_rate": 0.0002, "epoch": 1.117927743694615, "step": 820}, {"loss": 1.6383, "grad_norm": 0.3293766379356384, "learning_rate": 0.0002, "epoch": 1.1315610088616224, "step": 830}, {"loss": 1.7353, "grad_norm": 0.3380851745605469, "learning_rate": 0.0002, "epoch": 1.1451942740286298, "step": 840}, {"loss": 1.8236, "grad_norm": 0.32584455609321594, "learning_rate": 0.0002, "epoch": 1.1588275391956373, "step": 850}, {"loss": 1.6681, "grad_norm": 0.45700767636299133, "learning_rate": 0.0002, "epoch": 1.1724608043626448, "step": 860}, {"loss": 1.7494, "grad_norm": 0.30944544076919556, "learning_rate": 0.0002, "epoch": 1.1860940695296525, "step": 870}, {"loss": 1.7426, "grad_norm": 0.3268151581287384, "learning_rate": 0.0002, "epoch": 1.19972733469666, "step": 880}, {"loss": 1.7413, "grad_norm": 0.39972540736198425, "learning_rate": 0.0002, "epoch": 1.2133605998636674, "step": 890}, {"loss": 1.7481, "grad_norm": 0.7890929579734802, "learning_rate": 0.0002, "epoch": 1.2269938650306749, "step": 900}, {"loss": 1.7608, "grad_norm": 0.3439182639122009, "learning_rate": 0.0002, "epoch": 1.2406271301976823, "step": 910}, {"loss": 1.7617, "grad_norm": 0.3986225128173828, "learning_rate": 0.0002, "epoch": 1.2542603953646898, "step": 920}, {"loss": 1.6843, "grad_norm": 0.3514605164527893, "learning_rate": 0.0002, "epoch": 1.2678936605316973, "step": 930}, {"loss": 1.6987, "grad_norm": 0.3682589530944824, "learning_rate": 0.0002, "epoch": 1.2815269256987047, "step": 940}, {"loss": 1.6988, "grad_norm": 0.3618335723876953, "learning_rate": 0.0002, "epoch": 1.2951601908657122, "step": 950}, {"loss": 1.7436, "grad_norm": 0.345700740814209, "learning_rate": 0.0002, "epoch": 1.30879345603272, "step": 960}, {"loss": 1.7336, "grad_norm": 0.3514927923679352, "learning_rate": 0.0002, "epoch": 1.3224267211997274, "step": 970}, {"loss": 1.7704, "grad_norm": 0.365647554397583, "learning_rate": 0.0002, "epoch": 1.3360599863667348, "step": 980}, {"loss": 1.7104, "grad_norm": 0.3407285809516907, "learning_rate": 0.0002, "epoch": 1.3496932515337423, "step": 990}, {"loss": 1.7132, "grad_norm": 0.3785437345504761, "learning_rate": 0.0002, "epoch": 1.3633265167007498, "step": 1000}, {"loss": 1.766, "grad_norm": 0.34746724367141724, "learning_rate": 0.0002, "epoch": 1.3769597818677572, "step": 1010}, {"loss": 1.7252, "grad_norm": 0.362444132566452, "learning_rate": 0.0002, "epoch": 1.390593047034765, "step": 1020}, {"loss": 1.7132, "grad_norm": 0.4424704611301422, "learning_rate": 0.0002, "epoch": 1.4042263122017724, "step": 1030}, {"loss": 1.726, "grad_norm": 0.38722458481788635, "learning_rate": 0.0002, "epoch": 1.4178595773687799, "step": 1040}, {"loss": 1.7955, "grad_norm": 0.36089080572128296, "learning_rate": 0.0002, "epoch": 1.4314928425357873, "step": 1050}, {"loss": 1.6924, "grad_norm": 0.33817124366760254, "learning_rate": 0.0002, "epoch": 1.4451261077027948, "step": 1060}, {"loss": 1.7165, "grad_norm": 0.34334081411361694, "learning_rate": 0.0002, "epoch": 1.4587593728698023, "step": 1070}, {"loss": 1.6999, "grad_norm": 0.3776826858520508, "learning_rate": 0.0002, "epoch": 1.4723926380368098, "step": 1080}, {"loss": 1.7605, "grad_norm": 0.4169026017189026, "learning_rate": 0.0002, "epoch": 1.4860259032038172, "step": 1090}, {"loss": 1.7502, "grad_norm": 0.34898945689201355, "learning_rate": 0.0002, "epoch": 1.4996591683708247, "step": 1100}, {"loss": 1.635, "grad_norm": 0.34223780035972595, "learning_rate": 0.0002, "epoch": 1.5132924335378322, "step": 1110}, {"loss": 1.7248, "grad_norm": 0.3686901032924652, "learning_rate": 0.0002, "epoch": 1.5269256987048399, "step": 1120}, {"loss": 1.7525, "grad_norm": 0.35054415464401245, "learning_rate": 0.0002, "epoch": 1.5405589638718473, "step": 1130}, {"loss": 1.7776, "grad_norm": 0.39496365189552307, "learning_rate": 0.0002, "epoch": 1.5541922290388548, "step": 1140}, {"loss": 1.6574, "grad_norm": 0.35451626777648926, "learning_rate": 0.0002, "epoch": 1.5678254942058623, "step": 1150}, {"loss": 1.7257, "grad_norm": 0.3848083019256592, "learning_rate": 0.0002, "epoch": 1.58145875937287, "step": 1160}, {"loss": 1.7272, "grad_norm": 0.3760537803173065, "learning_rate": 0.0002, "epoch": 1.5950920245398774, "step": 1170}, {"loss": 1.7441, "grad_norm": 0.38981738686561584, "learning_rate": 0.0002, "epoch": 1.6087252897068849, "step": 1180}, {"loss": 1.6951, "grad_norm": 0.36830949783325195, "learning_rate": 0.0002, "epoch": 1.6223585548738924, "step": 1190}, {"loss": 1.6925, "grad_norm": 0.3405892848968506, "learning_rate": 0.0002, "epoch": 1.6359918200408998, "step": 1200}, {"loss": 1.7473, "grad_norm": 0.39027872681617737, "learning_rate": 0.0002, "epoch": 1.6496250852079073, "step": 1210}, {"loss": 1.6792, "grad_norm": 0.3342694044113159, "learning_rate": 0.0002, "epoch": 1.6632583503749148, "step": 1220}, {"loss": 1.7196, "grad_norm": 0.3600076735019684, "learning_rate": 0.0002, "epoch": 1.6768916155419222, "step": 1230}, {"loss": 1.7021, "grad_norm": 0.3625542223453522, "learning_rate": 0.0002, "epoch": 1.6905248807089297, "step": 1240}, {"loss": 1.6772, "grad_norm": 0.32170894742012024, "learning_rate": 0.0002, "epoch": 1.7041581458759372, "step": 1250}, {"loss": 1.7152, "grad_norm": 0.3544139862060547, "learning_rate": 0.0002, "epoch": 1.7177914110429446, "step": 1260}, {"loss": 1.7138, "grad_norm": 0.35113027691841125, "learning_rate": 0.0002, "epoch": 1.7314246762099523, "step": 1270}, {"loss": 1.7095, "grad_norm": 0.3499974310398102, "learning_rate": 0.0002, "epoch": 1.7450579413769598, "step": 1280}, {"loss": 1.7749, "grad_norm": 0.3285157382488251, "learning_rate": 0.0002, "epoch": 1.7586912065439673, "step": 1290}, {"loss": 1.6767, "grad_norm": 0.3701961636543274, "learning_rate": 0.0002, "epoch": 1.7723244717109747, "step": 1300}, {"loss": 1.6282, "grad_norm": 0.3301318287849426, "learning_rate": 0.0002, "epoch": 1.7859577368779824, "step": 1310}, {"loss": 1.7097, "grad_norm": 0.37801554799079895, "learning_rate": 0.0002, "epoch": 1.79959100204499, "step": 1320}, {"loss": 1.7437, "grad_norm": 0.3726748526096344, "learning_rate": 0.0002, "epoch": 1.8132242672119974, "step": 1330}, {"loss": 1.7959, "grad_norm": 0.4059790074825287, "learning_rate": 0.0002, "epoch": 1.8268575323790048, "step": 1340}, {"loss": 1.7739, "grad_norm": 0.35712096095085144, "learning_rate": 0.0002, "epoch": 1.8404907975460123, "step": 1350}, {"loss": 1.6375, "grad_norm": 0.35995328426361084, "learning_rate": 0.0002, "epoch": 1.8541240627130198, "step": 1360}, {"loss": 1.7332, "grad_norm": 0.3679947257041931, "learning_rate": 0.0002, "epoch": 1.8677573278800272, "step": 1370}, {"loss": 1.7587, "grad_norm": 0.39645957946777344, "learning_rate": 0.0002, "epoch": 1.8813905930470347, "step": 1380}, {"loss": 1.6985, "grad_norm": 0.35288700461387634, "learning_rate": 0.0002, "epoch": 1.8950238582140422, "step": 1390}, {"loss": 1.6582, "grad_norm": 0.32579198479652405, "learning_rate": 0.0002, "epoch": 1.9086571233810496, "step": 1400}, {"loss": 1.6948, "grad_norm": 0.3856561779975891, "learning_rate": 0.0002, "epoch": 1.9222903885480571, "step": 1410}, {"loss": 1.668, "grad_norm": 0.39019331336021423, "learning_rate": 0.0002, "epoch": 1.9359236537150648, "step": 1420}, {"loss": 1.7774, "grad_norm": 0.38006502389907837, "learning_rate": 0.0002, "epoch": 1.9495569188820723, "step": 1430}, {"loss": 1.8323, "grad_norm": 0.38100454211235046, "learning_rate": 0.0002, "epoch": 1.9631901840490797, "step": 1440}, {"loss": 1.7298, "grad_norm": 0.3405798673629761, "learning_rate": 0.0002, "epoch": 1.9768234492160872, "step": 1450}, {"loss": 1.7045, "grad_norm": 0.36582913994789124, "learning_rate": 0.0002, "epoch": 1.990456714383095, "step": 1460}, {"eval_loss": 1.8178424835205078, "eval_runtime": 53.6524, "eval_samples_per_second": 9.45, "eval_steps_per_second": 1.193, "epoch": 2.0, "step": 1467}, {"loss": 1.6363, "grad_norm": 0.3626647889614105, "learning_rate": 0.0002, "epoch": 2.0040899795501024, "step": 1470}, {"loss": 1.5354, "grad_norm": 0.40171775221824646, "learning_rate": 0.0002, "epoch": 2.01772324471711, "step": 1480}, {"loss": 1.5566, "grad_norm": 0.5805319547653198, "learning_rate": 0.0002, "epoch": 2.0313565098841173, "step": 1490}, {"loss": 1.546, "grad_norm": 0.41954153776168823, "learning_rate": 0.0002, "epoch": 2.044989775051125, "step": 1500}, {"loss": 1.6158, "grad_norm": 0.47190725803375244, "learning_rate": 0.0002, "epoch": 2.0586230402181322, "step": 1510}, {"loss": 1.5841, "grad_norm": 0.4388456344604492, "learning_rate": 0.0002, "epoch": 2.0722563053851397, "step": 1520}, {"loss": 1.5835, "grad_norm": 2.2171926498413086, "learning_rate": 0.0002, "epoch": 2.085889570552147, "step": 1530}, {"loss": 1.6137, "grad_norm": 0.4314221143722534, "learning_rate": 0.0002, "epoch": 2.0995228357191547, "step": 1540}, {"loss": 1.5511, "grad_norm": 0.4154265522956848, "learning_rate": 0.0002, "epoch": 2.113156100886162, "step": 1550}, {"loss": 1.6323, "grad_norm": 0.5025539994239807, "learning_rate": 0.0002, "epoch": 2.1267893660531696, "step": 1560}, {"loss": 1.5903, "grad_norm": 0.5410493016242981, "learning_rate": 0.0002, "epoch": 2.140422631220177, "step": 1570}, {"loss": 1.507, "grad_norm": 0.4478487968444824, "learning_rate": 0.0002, "epoch": 2.1540558963871845, "step": 1580}, {"loss": 1.5536, "grad_norm": 0.4703652560710907, "learning_rate": 0.0002, "epoch": 2.1676891615541924, "step": 1590}, {"loss": 1.5991, "grad_norm": 0.4555390179157257, "learning_rate": 0.0002, "epoch": 2.1813224267212, "step": 1600}, {"loss": 1.6117, "grad_norm": 0.4877263903617859, "learning_rate": 0.0002, "epoch": 2.1949556918882074, "step": 1610}, {"loss": 1.5928, "grad_norm": 0.48708245158195496, "learning_rate": 0.0002, "epoch": 2.208588957055215, "step": 1620}, {"loss": 1.6106, "grad_norm": 0.47523951530456543, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 1630}, {"loss": 1.6013, "grad_norm": 0.4889199733734131, "learning_rate": 0.0002, "epoch": 2.23585548738923, "step": 1640}, {"loss": 1.6633, "grad_norm": 0.4585252106189728, "learning_rate": 0.0002, "epoch": 2.2494887525562373, "step": 1650}, {"loss": 1.6075, "grad_norm": 0.4764868915081024, "learning_rate": 0.0002, "epoch": 2.2631220177232447, "step": 1660}, {"loss": 1.6427, "grad_norm": 0.5028976202011108, "learning_rate": 0.0002, "epoch": 2.276755282890252, "step": 1670}, {"loss": 1.6258, "grad_norm": 0.46131211519241333, "learning_rate": 0.0002, "epoch": 2.2903885480572597, "step": 1680}, {"loss": 1.654, "grad_norm": 0.5422874689102173, "learning_rate": 0.0002, "epoch": 2.304021813224267, "step": 1690}, {"loss": 1.6331, "grad_norm": 0.47615355253219604, "learning_rate": 0.0002, "epoch": 2.3176550783912746, "step": 1700}, {"loss": 1.642, "grad_norm": 0.48005548119544983, "learning_rate": 0.0002, "epoch": 2.331288343558282, "step": 1710}, {"loss": 1.581, "grad_norm": 0.4387182295322418, "learning_rate": 0.0002, "epoch": 2.3449216087252895, "step": 1720}, {"loss": 1.5612, "grad_norm": 0.4487272799015045, "learning_rate": 0.0002, "epoch": 2.358554873892297, "step": 1730}, {"loss": 1.5089, "grad_norm": 0.5046455264091492, "learning_rate": 0.0002, "epoch": 2.372188139059305, "step": 1740}, {"loss": 1.5769, "grad_norm": 0.4653521180152893, "learning_rate": 0.0002, "epoch": 2.3858214042263124, "step": 1750}, {"loss": 1.6201, "grad_norm": 0.4737723469734192, "learning_rate": 0.0002, "epoch": 2.39945466939332, "step": 1760}, {"loss": 1.5933, "grad_norm": 0.4501931071281433, "learning_rate": 0.0002, "epoch": 2.4130879345603273, "step": 1770}, {"loss": 1.6321, "grad_norm": 0.4772880971431732, "learning_rate": 0.0002, "epoch": 2.426721199727335, "step": 1780}, {"loss": 1.5454, "grad_norm": 0.4544616937637329, "learning_rate": 0.0002, "epoch": 2.4403544648943423, "step": 1790}, {"loss": 1.5501, "grad_norm": 0.488313227891922, "learning_rate": 0.0002, "epoch": 2.4539877300613497, "step": 1800}, {"loss": 1.5791, "grad_norm": 0.5057830214500427, "learning_rate": 0.0002, "epoch": 2.467620995228357, "step": 1810}, {"loss": 1.5645, "grad_norm": 0.5049484968185425, "learning_rate": 0.0002, "epoch": 2.4812542603953647, "step": 1820}, {"loss": 1.6268, "grad_norm": 0.44966644048690796, "learning_rate": 0.0002, "epoch": 2.494887525562372, "step": 1830}, {"loss": 1.5941, "grad_norm": 0.5072630643844604, "learning_rate": 0.0002, "epoch": 2.5085207907293796, "step": 1840}, {"loss": 1.5251, "grad_norm": 0.43989792466163635, "learning_rate": 0.0002, "epoch": 2.522154055896387, "step": 1850}, {"loss": 1.563, "grad_norm": 1.3504403829574585, "learning_rate": 0.0002, "epoch": 2.5357873210633946, "step": 1860}, {"loss": 1.5681, "grad_norm": 0.46545976400375366, "learning_rate": 0.0002, "epoch": 2.549420586230402, "step": 1870}, {"loss": 1.6368, "grad_norm": 0.4678342044353485, "learning_rate": 0.0002, "epoch": 2.5630538513974095, "step": 1880}, {"loss": 1.5814, "grad_norm": 0.529755711555481, "learning_rate": 0.0002, "epoch": 2.5766871165644174, "step": 1890}, {"loss": 1.5861, "grad_norm": 0.5000199675559998, "learning_rate": 0.0002, "epoch": 2.5903203817314244, "step": 1900}, {"loss": 1.6346, "grad_norm": 0.5649300217628479, "learning_rate": 0.0002, "epoch": 2.6039536468984323, "step": 1910}, {"loss": 1.6317, "grad_norm": 0.7920585870742798, "learning_rate": 0.0002, "epoch": 2.61758691206544, "step": 1920}, {"loss": 1.643, "grad_norm": 0.4960342049598694, "learning_rate": 0.0002, "epoch": 2.6312201772324473, "step": 1930}, {"loss": 1.6099, "grad_norm": 0.5324710011482239, "learning_rate": 0.0002, "epoch": 2.6448534423994547, "step": 1940}, {"loss": 1.5874, "grad_norm": 0.606343150138855, "learning_rate": 0.0002, "epoch": 2.658486707566462, "step": 1950}, {"loss": 1.5728, "grad_norm": 0.53038489818573, "learning_rate": 0.0002, "epoch": 2.6721199727334697, "step": 1960}, {"loss": 1.5583, "grad_norm": 0.4579465091228485, "learning_rate": 0.0002, "epoch": 2.685753237900477, "step": 1970}, {"loss": 1.6093, "grad_norm": 0.4541707932949066, "learning_rate": 0.0002, "epoch": 2.6993865030674846, "step": 1980}, {"loss": 1.5316, "grad_norm": 0.5009395480155945, "learning_rate": 0.0002, "epoch": 2.713019768234492, "step": 1990}, {"loss": 1.6724, "grad_norm": 0.4723006784915924, "learning_rate": 0.0002, "epoch": 2.7266530334014996, "step": 2000}, {"loss": 1.638, "grad_norm": 0.5086126923561096, "learning_rate": 0.0002, "epoch": 2.740286298568507, "step": 2010}, {"loss": 1.6223, "grad_norm": 0.47242608666419983, "learning_rate": 0.0002, "epoch": 2.7539195637355145, "step": 2020}, {"loss": 1.6242, "grad_norm": 0.44922566413879395, "learning_rate": 0.0002, "epoch": 2.767552828902522, "step": 2030}, {"loss": 1.6837, "grad_norm": 0.420259565114975, "learning_rate": 0.0002, "epoch": 2.78118609406953, "step": 2040}, {"loss": 1.5612, "grad_norm": 0.4762881100177765, "learning_rate": 0.0002, "epoch": 2.794819359236537, "step": 2050}, {"loss": 1.5506, "grad_norm": 0.5228786468505859, "learning_rate": 0.0002, "epoch": 2.808452624403545, "step": 2060}, {"loss": 1.6347, "grad_norm": 0.4796035587787628, "learning_rate": 0.0002, "epoch": 2.8220858895705523, "step": 2070}, {"loss": 1.6843, "grad_norm": 0.5034735202789307, "learning_rate": 0.0002, "epoch": 2.8357191547375598, "step": 2080}, {"loss": 1.6455, "grad_norm": 0.48005399107933044, "learning_rate": 0.0002, "epoch": 2.8493524199045672, "step": 2090}, {"loss": 1.6287, "grad_norm": 0.578820526599884, "learning_rate": 0.0002, "epoch": 2.8629856850715747, "step": 2100}, {"loss": 1.6021, "grad_norm": 0.48982638120651245, "learning_rate": 0.0002, "epoch": 2.876618950238582, "step": 2110}, {"loss": 1.5769, "grad_norm": 0.5157325863838196, "learning_rate": 0.0002, "epoch": 2.8902522154055896, "step": 2120}, {"loss": 1.6089, "grad_norm": 0.49149683117866516, "learning_rate": 0.0002, "epoch": 2.903885480572597, "step": 2130}, {"loss": 1.5881, "grad_norm": 0.48584499955177307, "learning_rate": 0.0002, "epoch": 2.9175187457396046, "step": 2140}, {"loss": 1.5833, "grad_norm": 0.5199017524719238, "learning_rate": 0.0002, "epoch": 2.931152010906612, "step": 2150}, {"loss": 1.7344, "grad_norm": 0.5788236856460571, "learning_rate": 0.0002, "epoch": 2.9447852760736195, "step": 2160}, {"loss": 1.6103, "grad_norm": 0.48664185404777527, "learning_rate": 0.0002, "epoch": 2.958418541240627, "step": 2170}, {"loss": 1.5765, "grad_norm": 0.5026682615280151, "learning_rate": 0.0002, "epoch": 2.9720518064076344, "step": 2180}, {"loss": 1.6626, "grad_norm": 0.49317044019699097, "learning_rate": 0.0002, "epoch": 2.9856850715746424, "step": 2190}, {"loss": 1.5871, "grad_norm": 0.5729128122329712, "learning_rate": 0.0002, "epoch": 2.9993183367416494, "step": 2200}, {"eval_loss": 1.8527295589447021, "eval_runtime": 53.6403, "eval_samples_per_second": 9.452, "eval_steps_per_second": 1.193, "epoch": 2.9993183367416494, "step": 2200}, {"loss": 1.4719, "grad_norm": 0.5530241131782532, "learning_rate": 0.0002, "epoch": 3.0129516019086573, "step": 2210}, {"loss": 1.4088, "grad_norm": 0.6642216444015503, "learning_rate": 0.0002, "epoch": 3.0265848670756648, "step": 2220}, {"loss": 1.4382, "grad_norm": 0.61470627784729, "learning_rate": 0.0002, "epoch": 3.0402181322426722, "step": 2230}, {"loss": 1.4634, "grad_norm": 0.8559566140174866, "learning_rate": 0.0002, "epoch": 3.0538513974096797, "step": 2240}, {"loss": 1.3854, "grad_norm": 0.7015801668167114, "learning_rate": 0.0002, "epoch": 3.067484662576687, "step": 2250}, {"loss": 1.4981, "grad_norm": 0.7226442694664001, "learning_rate": 0.0002, "epoch": 3.0811179277436946, "step": 2260}, {"loss": 1.4143, "grad_norm": 0.7560588717460632, "learning_rate": 0.0002, "epoch": 3.094751192910702, "step": 2270}, {"loss": 1.4395, "grad_norm": 0.6216568946838379, "learning_rate": 0.0002, "epoch": 3.1083844580777096, "step": 2280}, {"loss": 1.3842, "grad_norm": 0.6768500804901123, "learning_rate": 0.0002, "epoch": 3.122017723244717, "step": 2290}, {"loss": 1.4672, "grad_norm": 0.7028762102127075, "learning_rate": 0.0002, "epoch": 3.1356509884117245, "step": 2300}, {"loss": 1.3826, "grad_norm": 0.6329697966575623, "learning_rate": 0.0002, "epoch": 3.149284253578732, "step": 2310}, {"loss": 1.442, "grad_norm": 0.6328264474868774, "learning_rate": 0.0002, "epoch": 3.1629175187457395, "step": 2320}, {"loss": 1.3762, "grad_norm": 0.7573632001876831, "learning_rate": 0.0002, "epoch": 3.176550783912747, "step": 2330}, {"loss": 1.3553, "grad_norm": 0.595740795135498, "learning_rate": 0.0002, "epoch": 3.190184049079755, "step": 2340}, {"loss": 1.3953, "grad_norm": 0.7111806869506836, "learning_rate": 0.0002, "epoch": 3.2038173142467623, "step": 2350}, {"loss": 1.3797, "grad_norm": 0.6328730583190918, "learning_rate": 0.0002, "epoch": 3.2174505794137698, "step": 2360}, {"loss": 1.3855, "grad_norm": 0.5860254168510437, "learning_rate": 0.0002, "epoch": 3.2310838445807772, "step": 2370}, {"loss": 1.4267, "grad_norm": 0.7387157082557678, "learning_rate": 0.0002, "epoch": 3.2447171097477847, "step": 2380}, {"loss": 1.4837, "grad_norm": 0.6897673606872559, "learning_rate": 0.0002, "epoch": 3.258350374914792, "step": 2390}, {"loss": 1.4372, "grad_norm": 0.7157699465751648, "learning_rate": 0.0002, "epoch": 3.2719836400817996, "step": 2400}, {"loss": 1.4432, "grad_norm": 0.6422511339187622, "learning_rate": 0.0002, "epoch": 3.285616905248807, "step": 2410}, {"loss": 1.4828, "grad_norm": 1.0481886863708496, "learning_rate": 0.0002, "epoch": 3.2992501704158146, "step": 2420}, {"loss": 1.4473, "grad_norm": 0.7050786018371582, "learning_rate": 0.0002, "epoch": 3.312883435582822, "step": 2430}, {"loss": 1.3465, "grad_norm": 0.6090759038925171, "learning_rate": 0.0002, "epoch": 3.3265167007498295, "step": 2440}, {"loss": 1.4619, "grad_norm": 0.6626465320587158, "learning_rate": 0.0002, "epoch": 3.340149965916837, "step": 2450}, {"loss": 1.4512, "grad_norm": 0.6565486788749695, "learning_rate": 0.0002, "epoch": 3.3537832310838445, "step": 2460}, {"loss": 1.588, "grad_norm": 0.6449528932571411, "learning_rate": 0.0002, "epoch": 3.367416496250852, "step": 2470}, {"loss": 1.4773, "grad_norm": 0.7746227383613586, "learning_rate": 0.0002, "epoch": 3.3810497614178594, "step": 2480}, {"loss": 1.417, "grad_norm": 0.7074846029281616, "learning_rate": 0.0002, "epoch": 3.3946830265848673, "step": 2490}, {"loss": 1.4476, "grad_norm": 0.6547690033912659, "learning_rate": 0.0002, "epoch": 3.4083162917518743, "step": 2500}, {"loss": 1.4074, "grad_norm": 0.784721314907074, "learning_rate": 0.0002, "epoch": 3.4219495569188823, "step": 2510}, {"loss": 1.4326, "grad_norm": 0.7270277738571167, "learning_rate": 0.0002, "epoch": 3.4355828220858897, "step": 2520}, {"loss": 1.4354, "grad_norm": 0.67588871717453, "learning_rate": 0.0002, "epoch": 3.449216087252897, "step": 2530}, {"loss": 1.4074, "grad_norm": 0.6768023371696472, "learning_rate": 0.0002, "epoch": 3.4628493524199047, "step": 2540}, {"loss": 1.4863, "grad_norm": 0.7026481628417969, "learning_rate": 0.0002, "epoch": 3.476482617586912, "step": 2550}, {"loss": 1.468, "grad_norm": 0.646075963973999, "learning_rate": 0.0002, "epoch": 3.4901158827539196, "step": 2560}, {"loss": 1.4058, "grad_norm": 0.6288973689079285, "learning_rate": 0.0002, "epoch": 3.503749147920927, "step": 2570}, {"loss": 1.4613, "grad_norm": 0.6440825462341309, "learning_rate": 0.0002, "epoch": 3.5173824130879345, "step": 2580}, {"loss": 1.3808, "grad_norm": 0.7074111700057983, "learning_rate": 0.0002, "epoch": 3.531015678254942, "step": 2590}, {"loss": 1.4901, "grad_norm": 0.7007562518119812, "learning_rate": 0.0002, "epoch": 3.5446489434219495, "step": 2600}, {"loss": 1.4511, "grad_norm": 0.6045376658439636, "learning_rate": 0.0002, "epoch": 3.558282208588957, "step": 2610}, {"loss": 1.4596, "grad_norm": 0.9149952530860901, "learning_rate": 0.0002, "epoch": 3.5719154737559644, "step": 2620}, {"loss": 1.4355, "grad_norm": 0.6490362882614136, "learning_rate": 0.0002, "epoch": 3.585548738922972, "step": 2630}, {"loss": 1.4107, "grad_norm": 0.6552226543426514, "learning_rate": 0.0002, "epoch": 3.59918200408998, "step": 2640}, {"loss": 1.433, "grad_norm": 0.6541850566864014, "learning_rate": 0.0002, "epoch": 3.612815269256987, "step": 2650}, {"loss": 1.4279, "grad_norm": 0.6500770449638367, "learning_rate": 0.0002, "epoch": 3.6264485344239947, "step": 2660}, {"loss": 1.3929, "grad_norm": 0.6345893740653992, "learning_rate": 0.0002, "epoch": 3.640081799591002, "step": 2670}, {"loss": 1.3634, "grad_norm": 0.6382275223731995, "learning_rate": 0.0002, "epoch": 3.6537150647580097, "step": 2680}, {"loss": 1.4478, "grad_norm": 0.6738566160202026, "learning_rate": 0.0002, "epoch": 3.667348329925017, "step": 2690}, {"loss": 1.4642, "grad_norm": 0.7446315288543701, "learning_rate": 0.0002, "epoch": 3.6809815950920246, "step": 2700}, {"loss": 1.4342, "grad_norm": 0.6717571020126343, "learning_rate": 0.0002, "epoch": 3.694614860259032, "step": 2710}, {"loss": 1.4285, "grad_norm": 0.667259693145752, "learning_rate": 0.0002, "epoch": 3.7082481254260395, "step": 2720}, {"loss": 1.5389, "grad_norm": 0.6808622479438782, "learning_rate": 0.0002, "epoch": 3.721881390593047, "step": 2730}, {"loss": 1.4297, "grad_norm": 0.7254287004470825, "learning_rate": 0.0002, "epoch": 3.7355146557600545, "step": 2740}, {"loss": 1.4176, "grad_norm": 0.6864007711410522, "learning_rate": 0.0002, "epoch": 3.749147920927062, "step": 2750}, {"loss": 1.4811, "grad_norm": 0.7041361331939697, "learning_rate": 0.0002, "epoch": 3.7627811860940694, "step": 2760}, {"loss": 1.4284, "grad_norm": 0.6559903025627136, "learning_rate": 0.0002, "epoch": 3.776414451261077, "step": 2770}, {"loss": 1.4608, "grad_norm": 0.6602269411087036, "learning_rate": 0.0002, "epoch": 3.7900477164280844, "step": 2780}, {"loss": 1.4588, "grad_norm": 0.692611813545227, "learning_rate": 0.0002, "epoch": 3.8036809815950923, "step": 2790}, {"loss": 1.4065, "grad_norm": 0.7051475644111633, "learning_rate": 0.0002, "epoch": 3.8173142467620993, "step": 2800}, {"loss": 1.4083, "grad_norm": 0.6685371398925781, "learning_rate": 0.0002, "epoch": 3.830947511929107, "step": 2810}, {"loss": 1.5227, "grad_norm": 0.6706477403640747, "learning_rate": 0.0002, "epoch": 3.8445807770961147, "step": 2820}, {"loss": 1.4076, "grad_norm": 0.6671637296676636, "learning_rate": 0.0002, "epoch": 3.858214042263122, "step": 2830}, {"loss": 1.4736, "grad_norm": 0.694092333316803, "learning_rate": 0.0002, "epoch": 3.8718473074301296, "step": 2840}, {"loss": 1.4161, "grad_norm": 0.7349600195884705, "learning_rate": 0.0002, "epoch": 3.885480572597137, "step": 2850}, {"loss": 1.4617, "grad_norm": 0.6647971868515015, "learning_rate": 0.0002, "epoch": 3.8991138377641446, "step": 2860}, {"loss": 1.5046, "grad_norm": 0.806656539440155, "learning_rate": 0.0002, "epoch": 3.912747102931152, "step": 2870}, {"loss": 1.428, "grad_norm": 0.6008772850036621, "learning_rate": 0.0002, "epoch": 3.9263803680981595, "step": 2880}, {"loss": 1.4116, "grad_norm": 0.659227728843689, "learning_rate": 0.0002, "epoch": 3.940013633265167, "step": 2890}, {"loss": 1.4136, "grad_norm": 0.6357656717300415, "learning_rate": 0.0002, "epoch": 3.9536468984321744, "step": 2900}, {"loss": 1.4655, "grad_norm": 0.6541687846183777, "learning_rate": 0.0002, "epoch": 3.967280163599182, "step": 2910}, {"loss": 1.4854, "grad_norm": 0.6090909838676453, "learning_rate": 0.0002, "epoch": 3.9809134287661894, "step": 2920}, {"loss": 1.4615, "grad_norm": 0.7198411822319031, "learning_rate": 0.0002, "epoch": 3.994546693933197, "step": 2930}, {"eval_loss": 1.9278366565704346, "eval_runtime": 53.6567, "eval_samples_per_second": 9.449, "eval_steps_per_second": 1.193, "epoch": 4.0, "step": 2934}, {"loss": 1.3159, "grad_norm": 0.6498575210571289, "learning_rate": 0.0002, "epoch": 4.008179959100205, "step": 2940}, {"loss": 1.2075, "grad_norm": 0.865602433681488, "learning_rate": 0.0002, "epoch": 4.021813224267212, "step": 2950}, {"loss": 1.1744, "grad_norm": 0.8514999151229858, "learning_rate": 0.0002, "epoch": 4.03544648943422, "step": 2960}, {"loss": 1.1553, "grad_norm": 1.0677322149276733, "learning_rate": 0.0002, "epoch": 4.049079754601227, "step": 2970}, {"loss": 1.1962, "grad_norm": 1.0126488208770752, "learning_rate": 0.0002, "epoch": 4.062713019768235, "step": 2980}, {"loss": 1.1631, "grad_norm": 1.0008870363235474, "learning_rate": 0.0002, "epoch": 4.076346284935242, "step": 2990}, {"loss": 1.2154, "grad_norm": 0.7942054271697998, "learning_rate": 0.0002, "epoch": 4.08997955010225, "step": 3000}, {"loss": 1.214, "grad_norm": 1.0482100248336792, "learning_rate": 0.0002, "epoch": 4.103612815269257, "step": 3010}, {"loss": 1.1999, "grad_norm": 1.0516992807388306, "learning_rate": 0.0002, "epoch": 4.1172460804362645, "step": 3020}, {"loss": 1.2108, "grad_norm": 0.8144322037696838, "learning_rate": 0.0002, "epoch": 4.130879345603272, "step": 3030}, {"loss": 1.1782, "grad_norm": 0.952297568321228, "learning_rate": 0.0002, "epoch": 4.144512610770279, "step": 3040}, {"loss": 1.2814, "grad_norm": 1.007645606994629, "learning_rate": 0.0002, "epoch": 4.158145875937287, "step": 3050}, {"loss": 1.1731, "grad_norm": 1.0480353832244873, "learning_rate": 0.0002, "epoch": 4.171779141104294, "step": 3060}, {"loss": 1.196, "grad_norm": 0.9270663857460022, "learning_rate": 0.0002, "epoch": 4.185412406271302, "step": 3070}, {"loss": 1.2167, "grad_norm": 1.3415262699127197, "learning_rate": 0.0002, "epoch": 4.199045671438309, "step": 3080}, {"loss": 1.2601, "grad_norm": 1.167606234550476, "learning_rate": 0.0002, "epoch": 4.212678936605317, "step": 3090}, {"loss": 1.2605, "grad_norm": 0.9418690800666809, "learning_rate": 0.0002, "epoch": 4.226312201772324, "step": 3100}, {"loss": 1.2184, "grad_norm": 1.0885876417160034, "learning_rate": 0.0002, "epoch": 4.239945466939332, "step": 3110}, {"loss": 1.2594, "grad_norm": 0.9165483713150024, "learning_rate": 0.0002, "epoch": 4.253578732106339, "step": 3120}, {"loss": 1.2933, "grad_norm": 0.9154694080352783, "learning_rate": 0.0002, "epoch": 4.267211997273347, "step": 3130}, {"loss": 1.2584, "grad_norm": 1.100580096244812, "learning_rate": 0.0002, "epoch": 4.280845262440354, "step": 3140}, {"loss": 1.251, "grad_norm": 0.9367576241493225, "learning_rate": 0.0002, "epoch": 4.294478527607362, "step": 3150}, {"loss": 1.2032, "grad_norm": 0.9744015336036682, "learning_rate": 0.0002, "epoch": 4.308111792774369, "step": 3160}, {"loss": 1.2787, "grad_norm": 0.9865175485610962, "learning_rate": 0.0002, "epoch": 4.321745057941377, "step": 3170}, {"loss": 1.2161, "grad_norm": 1.0124907493591309, "learning_rate": 0.0002, "epoch": 4.335378323108385, "step": 3180}, {"loss": 1.2452, "grad_norm": 1.1044819355010986, "learning_rate": 0.0002, "epoch": 4.349011588275392, "step": 3190}, {"loss": 1.2483, "grad_norm": 0.9305577278137207, "learning_rate": 0.0002, "epoch": 4.3626448534424, "step": 3200}, {"loss": 1.2101, "grad_norm": 0.969265341758728, "learning_rate": 0.0002, "epoch": 4.376278118609407, "step": 3210}, {"loss": 1.2355, "grad_norm": 1.0671923160552979, "learning_rate": 0.0002, "epoch": 4.389911383776415, "step": 3220}, {"loss": 1.2259, "grad_norm": 0.9440539479255676, "learning_rate": 0.0002, "epoch": 4.403544648943422, "step": 3230}, {"loss": 1.1706, "grad_norm": 0.9824562668800354, "learning_rate": 0.0002, "epoch": 4.41717791411043, "step": 3240}, {"loss": 1.2234, "grad_norm": 1.0245535373687744, "learning_rate": 0.0002, "epoch": 4.430811179277437, "step": 3250}, {"loss": 1.2713, "grad_norm": 0.9629312753677368, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 3260}, {"loss": 1.2689, "grad_norm": 1.1556470394134521, "learning_rate": 0.0002, "epoch": 4.458077709611452, "step": 3270}, {"loss": 1.2214, "grad_norm": 0.9796679019927979, "learning_rate": 0.0002, "epoch": 4.47171097477846, "step": 3280}, {"loss": 1.2823, "grad_norm": 0.9030535221099854, "learning_rate": 0.0002, "epoch": 4.485344239945467, "step": 3290}, {"loss": 1.2111, "grad_norm": 0.9142820835113525, "learning_rate": 0.0002, "epoch": 4.4989775051124745, "step": 3300}, {"loss": 1.2398, "grad_norm": 0.966867208480835, "learning_rate": 0.0002, "epoch": 4.5126107702794815, "step": 3310}, {"loss": 1.2537, "grad_norm": 1.0127079486846924, "learning_rate": 0.0002, "epoch": 4.5262440354464895, "step": 3320}, {"loss": 1.2059, "grad_norm": 1.055506706237793, "learning_rate": 0.0002, "epoch": 4.539877300613497, "step": 3330}, {"loss": 1.2958, "grad_norm": 0.9831468462944031, "learning_rate": 0.0002, "epoch": 4.553510565780504, "step": 3340}, {"loss": 1.2643, "grad_norm": 0.9304661154747009, "learning_rate": 0.0002, "epoch": 4.567143830947512, "step": 3350}, {"loss": 1.3621, "grad_norm": 0.9369107484817505, "learning_rate": 0.0002, "epoch": 4.580777096114519, "step": 3360}, {"loss": 1.2301, "grad_norm": 1.009506344795227, "learning_rate": 0.0002, "epoch": 4.594410361281527, "step": 3370}, {"loss": 1.2535, "grad_norm": 1.0575741529464722, "learning_rate": 0.0002, "epoch": 4.608043626448534, "step": 3380}, {"loss": 1.1914, "grad_norm": 0.9102860689163208, "learning_rate": 0.0002, "epoch": 4.621676891615542, "step": 3390}, {"loss": 1.3156, "grad_norm": 0.8111315965652466, "learning_rate": 0.0002, "epoch": 4.635310156782549, "step": 3400}, {"loss": 1.3103, "grad_norm": 0.9459649920463562, "learning_rate": 0.0002, "epoch": 4.648943421949557, "step": 3410}, {"loss": 1.3146, "grad_norm": 0.9709545969963074, "learning_rate": 0.0002, "epoch": 4.662576687116564, "step": 3420}, {"loss": 1.2958, "grad_norm": 0.9909247159957886, "learning_rate": 0.0002, "epoch": 4.676209952283572, "step": 3430}, {"loss": 1.3186, "grad_norm": 0.9094610810279846, "learning_rate": 0.0002, "epoch": 4.689843217450579, "step": 3440}, {"loss": 1.3397, "grad_norm": 0.9012220501899719, "learning_rate": 0.0002, "epoch": 4.703476482617587, "step": 3450}, {"loss": 1.2595, "grad_norm": 0.8669242858886719, "learning_rate": 0.0002, "epoch": 4.717109747784594, "step": 3460}, {"loss": 1.2762, "grad_norm": 0.9753699898719788, "learning_rate": 0.0002, "epoch": 4.730743012951602, "step": 3470}, {"loss": 1.2371, "grad_norm": 1.0252684354782104, "learning_rate": 0.0002, "epoch": 4.74437627811861, "step": 3480}, {"loss": 1.2536, "grad_norm": 1.208098292350769, "learning_rate": 0.0002, "epoch": 4.758009543285617, "step": 3490}, {"loss": 1.2256, "grad_norm": 0.8632914423942566, "learning_rate": 0.0002, "epoch": 4.771642808452625, "step": 3500}, {"loss": 1.3062, "grad_norm": 1.0084818601608276, "learning_rate": 0.0002, "epoch": 4.785276073619632, "step": 3510}, {"loss": 1.3004, "grad_norm": 0.9095172882080078, "learning_rate": 0.0002, "epoch": 4.79890933878664, "step": 3520}, {"loss": 1.263, "grad_norm": 0.9740135669708252, "learning_rate": 0.0002, "epoch": 4.812542603953647, "step": 3530}, {"loss": 1.2816, "grad_norm": 0.8862348794937134, "learning_rate": 0.0002, "epoch": 4.826175869120655, "step": 3540}, {"loss": 1.2275, "grad_norm": 1.0761774778366089, "learning_rate": 0.0002, "epoch": 4.839809134287662, "step": 3550}, {"loss": 1.2257, "grad_norm": 1.0134117603302002, "learning_rate": 0.0002, "epoch": 4.85344239945467, "step": 3560}, {"loss": 1.2904, "grad_norm": 0.9262851476669312, "learning_rate": 0.0002, "epoch": 4.867075664621677, "step": 3570}, {"loss": 1.1466, "grad_norm": 0.9518504738807678, "learning_rate": 0.0002, "epoch": 4.8807089297886845, "step": 3580}, {"loss": 1.2741, "grad_norm": 1.10103178024292, "learning_rate": 0.0002, "epoch": 4.894342194955692, "step": 3590}, {"loss": 1.2592, "grad_norm": 1.0133225917816162, "learning_rate": 0.0002, "epoch": 4.9079754601226995, "step": 3600}, {"loss": 1.2856, "grad_norm": 0.9637737274169922, "learning_rate": 0.0002, "epoch": 4.9216087252897065, "step": 3610}, {"loss": 1.2991, "grad_norm": 0.9800633192062378, "learning_rate": 0.0002, "epoch": 4.935241990456714, "step": 3620}, {"loss": 1.2872, "grad_norm": 1.0065973997116089, "learning_rate": 0.0002, "epoch": 4.948875255623722, "step": 3630}, {"loss": 1.2408, "grad_norm": 0.9354690313339233, "learning_rate": 0.0002, "epoch": 4.962508520790729, "step": 3640}, {"loss": 1.291, "grad_norm": 0.9744119048118591, "learning_rate": 0.0002, "epoch": 4.976141785957737, "step": 3650}, {"loss": 1.2513, "grad_norm": 0.9357708096504211, "learning_rate": 0.0002, "epoch": 4.989775051124744, "step": 3660}, {"eval_loss": 2.0763096809387207, "eval_runtime": 53.6578, "eval_samples_per_second": 9.449, "eval_steps_per_second": 1.193, "epoch": 4.999318336741649, "step": 3667}, {"loss": 1.2323, "grad_norm": 1.3171669244766235, "learning_rate": 0.0002, "epoch": 5.003408316291752, "step": 3670}, {"loss": 0.9509, "grad_norm": 1.4427374601364136, "learning_rate": 0.0002, "epoch": 5.017041581458759, "step": 3680}, {"loss": 1.011, "grad_norm": 0.9313797354698181, "learning_rate": 0.0002, "epoch": 5.030674846625767, "step": 3690}, {"loss": 0.9481, "grad_norm": 1.417641282081604, "learning_rate": 0.0002, "epoch": 5.044308111792774, "step": 3700}, {"loss": 0.9477, "grad_norm": 1.097440242767334, "learning_rate": 0.0002, "epoch": 5.057941376959782, "step": 3710}, {"loss": 1.0416, "grad_norm": 1.4277986288070679, "learning_rate": 0.0002, "epoch": 5.071574642126789, "step": 3720}, {"loss": 0.9718, "grad_norm": 1.2520873546600342, "learning_rate": 0.0002, "epoch": 5.085207907293797, "step": 3730}, {"loss": 0.9531, "grad_norm": 1.39503812789917, "learning_rate": 0.0002, "epoch": 5.098841172460804, "step": 3740}, {"loss": 0.9658, "grad_norm": 1.2345329523086548, "learning_rate": 0.0002, "epoch": 5.112474437627812, "step": 3750}, {"loss": 1.0615, "grad_norm": 1.2700239419937134, "learning_rate": 0.0002, "epoch": 5.126107702794819, "step": 3760}, {"loss": 0.993, "grad_norm": 1.5343066453933716, "learning_rate": 0.0002, "epoch": 5.139740967961827, "step": 3770}, {"loss": 0.9378, "grad_norm": 1.4191608428955078, "learning_rate": 0.0002, "epoch": 5.153374233128835, "step": 3780}, {"loss": 1.0179, "grad_norm": 1.4591023921966553, "learning_rate": 0.0002, "epoch": 5.167007498295842, "step": 3790}, {"loss": 1.0143, "grad_norm": 1.6158121824264526, "learning_rate": 0.0002, "epoch": 5.18064076346285, "step": 3800}, {"loss": 1.0056, "grad_norm": 1.6077582836151123, "learning_rate": 0.0002, "epoch": 5.194274028629857, "step": 3810}, {"loss": 0.9711, "grad_norm": 1.2815653085708618, "learning_rate": 0.0002, "epoch": 5.207907293796865, "step": 3820}, {"loss": 1.0131, "grad_norm": 1.2427219152450562, "learning_rate": 0.0002, "epoch": 5.221540558963872, "step": 3830}, {"loss": 0.9901, "grad_norm": 1.3013232946395874, "learning_rate": 0.0002, "epoch": 5.23517382413088, "step": 3840}, {"loss": 0.9862, "grad_norm": 1.4643588066101074, "learning_rate": 0.0002, "epoch": 5.248807089297887, "step": 3850}, {"loss": 1.0149, "grad_norm": 1.2571916580200195, "learning_rate": 0.0002, "epoch": 5.2624403544648946, "step": 3860}, {"loss": 0.9686, "grad_norm": 1.226682186126709, "learning_rate": 0.0002, "epoch": 5.276073619631902, "step": 3870}, {"loss": 0.9417, "grad_norm": 1.2541271448135376, "learning_rate": 0.0002, "epoch": 5.2897068847989095, "step": 3880}, {"loss": 0.9767, "grad_norm": 1.2340261936187744, "learning_rate": 0.0002, "epoch": 5.3033401499659165, "step": 3890}, {"loss": 1.0173, "grad_norm": 1.345527172088623, "learning_rate": 0.0002, "epoch": 5.316973415132924, "step": 3900}, {"loss": 1.0638, "grad_norm": 1.2128909826278687, "learning_rate": 0.0002, "epoch": 5.3306066802999315, "step": 3910}, {"loss": 1.0002, "grad_norm": 1.3052637577056885, "learning_rate": 0.0002, "epoch": 5.344239945466939, "step": 3920}, {"loss": 0.9754, "grad_norm": 1.1017392873764038, "learning_rate": 0.0002, "epoch": 5.357873210633947, "step": 3930}, {"loss": 1.0579, "grad_norm": 1.26950204372406, "learning_rate": 0.0002, "epoch": 5.371506475800954, "step": 3940}, {"loss": 1.0816, "grad_norm": 1.3372546434402466, "learning_rate": 0.0002, "epoch": 5.385139740967962, "step": 3950}, {"loss": 1.0529, "grad_norm": 1.3115156888961792, "learning_rate": 0.0002, "epoch": 5.398773006134969, "step": 3960}, {"loss": 1.1179, "grad_norm": 1.3511474132537842, "learning_rate": 0.0002, "epoch": 5.412406271301977, "step": 3970}, {"loss": 1.0352, "grad_norm": 1.1001893281936646, "learning_rate": 0.0002, "epoch": 5.426039536468984, "step": 3980}, {"loss": 1.0855, "grad_norm": 1.2810745239257812, "learning_rate": 0.0002, "epoch": 5.439672801635992, "step": 3990}, {"loss": 1.0573, "grad_norm": 1.2999306917190552, "learning_rate": 0.0002, "epoch": 5.453306066802999, "step": 4000}, {"loss": 1.0073, "grad_norm": 1.172553300857544, "learning_rate": 0.0002, "epoch": 5.466939331970007, "step": 4010}, {"loss": 1.003, "grad_norm": 1.1483557224273682, "learning_rate": 0.0002, "epoch": 5.480572597137014, "step": 4020}, {"loss": 1.0704, "grad_norm": 1.4148036241531372, "learning_rate": 0.0002, "epoch": 5.494205862304022, "step": 4030}, {"loss": 1.0519, "grad_norm": 1.1611121892929077, "learning_rate": 0.0002, "epoch": 5.507839127471029, "step": 4040}, {"loss": 1.0775, "grad_norm": 1.3837119340896606, "learning_rate": 0.0002, "epoch": 5.521472392638037, "step": 4050}, {"loss": 1.0257, "grad_norm": 1.3025696277618408, "learning_rate": 0.0002, "epoch": 5.535105657805044, "step": 4060}, {"loss": 1.0628, "grad_norm": 1.348091959953308, "learning_rate": 0.0002, "epoch": 5.548738922972052, "step": 4070}, {"loss": 1.0447, "grad_norm": 1.3463449478149414, "learning_rate": 0.0002, "epoch": 5.56237218813906, "step": 4080}, {"loss": 1.039, "grad_norm": 1.3904176950454712, "learning_rate": 0.0002, "epoch": 5.576005453306067, "step": 4090}, {"loss": 1.0963, "grad_norm": 1.2737950086593628, "learning_rate": 0.0002, "epoch": 5.589638718473074, "step": 4100}, {"loss": 1.0441, "grad_norm": 1.3311827182769775, "learning_rate": 0.0002, "epoch": 5.603271983640082, "step": 4110}, {"loss": 1.0521, "grad_norm": 1.24485182762146, "learning_rate": 0.0002, "epoch": 5.61690524880709, "step": 4120}, {"loss": 1.1103, "grad_norm": 1.2724957466125488, "learning_rate": 0.0002, "epoch": 5.630538513974097, "step": 4130}, {"loss": 1.0588, "grad_norm": 1.3439847230911255, "learning_rate": 0.0002, "epoch": 5.644171779141105, "step": 4140}, {"loss": 1.0257, "grad_norm": 1.372359037399292, "learning_rate": 0.0002, "epoch": 5.657805044308112, "step": 4150}, {"loss": 1.0475, "grad_norm": 1.2322949171066284, "learning_rate": 0.0002, "epoch": 5.6714383094751195, "step": 4160}, {"loss": 1.0465, "grad_norm": 1.4859193563461304, "learning_rate": 0.0002, "epoch": 5.6850715746421265, "step": 4170}, {"loss": 1.1569, "grad_norm": 1.4318448305130005, "learning_rate": 0.0002, "epoch": 5.6987048398091344, "step": 4180}, {"loss": 1.017, "grad_norm": 1.1533565521240234, "learning_rate": 0.0002, "epoch": 5.7123381049761415, "step": 4190}, {"loss": 1.0948, "grad_norm": 1.3009696006774902, "learning_rate": 0.0002, "epoch": 5.725971370143149, "step": 4200}, {"loss": 1.1229, "grad_norm": 1.3972162008285522, "learning_rate": 0.0002, "epoch": 5.739604635310156, "step": 4210}, {"loss": 1.033, "grad_norm": 1.2142186164855957, "learning_rate": 0.0002, "epoch": 5.753237900477164, "step": 4220}, {"loss": 1.0588, "grad_norm": 1.401191234588623, "learning_rate": 0.0002, "epoch": 5.766871165644172, "step": 4230}, {"loss": 1.0722, "grad_norm": 1.4124404191970825, "learning_rate": 0.0002, "epoch": 5.780504430811179, "step": 4240}, {"loss": 1.0826, "grad_norm": 1.3488332033157349, "learning_rate": 0.0002, "epoch": 5.794137695978186, "step": 4250}, {"loss": 1.0599, "grad_norm": 1.3671752214431763, "learning_rate": 0.0002, "epoch": 5.807770961145194, "step": 4260}, {"loss": 1.1294, "grad_norm": 1.2608201503753662, "learning_rate": 0.0002, "epoch": 5.821404226312202, "step": 4270}, {"loss": 1.1216, "grad_norm": 1.1814045906066895, "learning_rate": 0.0002, "epoch": 5.835037491479209, "step": 4280}, {"loss": 1.0973, "grad_norm": 1.4139586687088013, "learning_rate": 0.0002, "epoch": 5.848670756646217, "step": 4290}, {"loss": 1.0656, "grad_norm": 1.34248948097229, "learning_rate": 0.0002, "epoch": 5.862304021813224, "step": 4300}, {"loss": 1.0791, "grad_norm": 1.1428139209747314, "learning_rate": 0.0002, "epoch": 5.875937286980232, "step": 4310}, {"loss": 1.0556, "grad_norm": 1.1941087245941162, "learning_rate": 0.0002, "epoch": 5.889570552147239, "step": 4320}, {"loss": 1.1089, "grad_norm": 1.2374001741409302, "learning_rate": 0.0002, "epoch": 5.903203817314247, "step": 4330}, {"loss": 1.0802, "grad_norm": 1.4314988851547241, "learning_rate": 0.0002, "epoch": 5.916837082481254, "step": 4340}, {"loss": 1.133, "grad_norm": 1.1286126375198364, "learning_rate": 0.0002, "epoch": 5.930470347648262, "step": 4350}, {"loss": 1.0807, "grad_norm": 1.25884211063385, "learning_rate": 0.0002, "epoch": 5.944103612815269, "step": 4360}, {"loss": 1.1189, "grad_norm": 1.223357915878296, "learning_rate": 0.0002, "epoch": 5.957736877982277, "step": 4370}, {"loss": 1.1335, "grad_norm": 1.2173810005187988, "learning_rate": 0.0002, "epoch": 5.971370143149285, "step": 4380}, {"loss": 1.1201, "grad_norm": 1.3152292966842651, "learning_rate": 0.0002, "epoch": 5.985003408316292, "step": 4390}, {"loss": 1.1456, "grad_norm": 1.5576739311218262, "learning_rate": 0.0002, "epoch": 5.998636673483299, "step": 4400}, {"eval_loss": 2.3435311317443848, "eval_runtime": 53.6362, "eval_samples_per_second": 9.453, "eval_steps_per_second": 1.193, "epoch": 6.0, "step": 4401}, {"loss": 0.8618, "grad_norm": 2.027981758117676, "learning_rate": 0.0002, "epoch": 6.012269938650307, "step": 4410}, {"loss": 0.7702, "grad_norm": 1.4775491952896118, "learning_rate": 0.0002, "epoch": 6.025903203817315, "step": 4420}, {"loss": 0.8042, "grad_norm": 1.6902967691421509, "learning_rate": 0.0002, "epoch": 6.039536468984322, "step": 4430}, {"loss": 0.7363, "grad_norm": 1.2506479024887085, "learning_rate": 0.0002, "epoch": 6.0531697341513295, "step": 4440}, {"loss": 0.7653, "grad_norm": 1.5935661792755127, "learning_rate": 0.0002, "epoch": 6.0668029993183366, "step": 4450}, {"loss": 0.7869, "grad_norm": 1.2966011762619019, "learning_rate": 0.0002, "epoch": 6.0804362644853445, "step": 4460}, {"loss": 0.7186, "grad_norm": 1.5247948169708252, "learning_rate": 0.0002, "epoch": 6.0940695296523515, "step": 4470}, {"loss": 0.7864, "grad_norm": 1.6415225267410278, "learning_rate": 0.0002, "epoch": 6.107702794819359, "step": 4480}, {"loss": 0.7561, "grad_norm": 1.5510778427124023, "learning_rate": 0.0002, "epoch": 6.121336059986366, "step": 4490}, {"loss": 0.7628, "grad_norm": 1.361097812652588, "learning_rate": 0.0002, "epoch": 6.134969325153374, "step": 4500}, {"loss": 0.8053, "grad_norm": 1.8347383737564087, "learning_rate": 0.0002, "epoch": 6.148602590320381, "step": 4510}, {"loss": 0.8074, "grad_norm": 1.570560097694397, "learning_rate": 0.0002, "epoch": 6.162235855487389, "step": 4520}, {"loss": 0.7536, "grad_norm": 1.517993688583374, "learning_rate": 0.0002, "epoch": 6.175869120654396, "step": 4530}, {"loss": 0.8028, "grad_norm": 1.4517489671707153, "learning_rate": 0.0002, "epoch": 6.189502385821404, "step": 4540}, {"loss": 0.8633, "grad_norm": 1.557098627090454, "learning_rate": 0.0002, "epoch": 6.203135650988412, "step": 4550}, {"loss": 0.7704, "grad_norm": 1.7379891872406006, "learning_rate": 0.0002, "epoch": 6.216768916155419, "step": 4560}, {"loss": 0.7341, "grad_norm": 2.2292542457580566, "learning_rate": 0.0002, "epoch": 6.230402181322427, "step": 4570}, {"loss": 0.7883, "grad_norm": 1.834366798400879, "learning_rate": 0.0002, "epoch": 6.244035446489434, "step": 4580}, {"loss": 0.8222, "grad_norm": 1.6755090951919556, "learning_rate": 0.0002, "epoch": 6.257668711656442, "step": 4590}, {"loss": 0.8245, "grad_norm": 1.828898549079895, "learning_rate": 0.0002, "epoch": 6.271301976823449, "step": 4600}, {"loss": 0.8116, "grad_norm": 1.9773457050323486, "learning_rate": 0.0002, "epoch": 6.284935241990457, "step": 4610}, {"loss": 0.811, "grad_norm": 1.533369541168213, "learning_rate": 0.0002, "epoch": 6.298568507157464, "step": 4620}, {"loss": 0.807, "grad_norm": 1.5432997941970825, "learning_rate": 0.0002, "epoch": 6.312201772324472, "step": 4630}, {"loss": 0.818, "grad_norm": 1.6686866283416748, "learning_rate": 0.0002, "epoch": 6.325835037491479, "step": 4640}, {"loss": 0.8656, "grad_norm": 1.545304298400879, "learning_rate": 0.0002, "epoch": 6.339468302658487, "step": 4650}, {"loss": 0.8239, "grad_norm": 1.5981945991516113, "learning_rate": 0.0002, "epoch": 6.353101567825494, "step": 4660}, {"loss": 0.8162, "grad_norm": 1.6973154544830322, "learning_rate": 0.0002, "epoch": 6.366734832992502, "step": 4670}, {"loss": 0.8377, "grad_norm": 1.6782612800598145, "learning_rate": 0.0002, "epoch": 6.38036809815951, "step": 4680}, {"loss": 0.8185, "grad_norm": 1.5710086822509766, "learning_rate": 0.0002, "epoch": 6.394001363326517, "step": 4690}, {"loss": 0.7948, "grad_norm": 1.7241147756576538, "learning_rate": 0.0002, "epoch": 6.407634628493525, "step": 4700}, {"loss": 0.8768, "grad_norm": 1.7736736536026, "learning_rate": 0.0002, "epoch": 6.421267893660532, "step": 4710}, {"loss": 0.8607, "grad_norm": 1.7924901247024536, "learning_rate": 0.0002, "epoch": 6.4349011588275395, "step": 4720}, {"loss": 0.832, "grad_norm": 1.4030500650405884, "learning_rate": 0.0002, "epoch": 6.448534423994547, "step": 4730}, {"loss": 0.8806, "grad_norm": 1.6925519704818726, "learning_rate": 0.0002, "epoch": 6.4621676891615545, "step": 4740}, {"loss": 0.8556, "grad_norm": 1.362905502319336, "learning_rate": 0.0002, "epoch": 6.4758009543285615, "step": 4750}, {"loss": 0.838, "grad_norm": 1.5281150341033936, "learning_rate": 0.0002, "epoch": 6.489434219495569, "step": 4760}, {"loss": 0.8396, "grad_norm": 1.524671196937561, "learning_rate": 0.0002, "epoch": 6.5030674846625764, "step": 4770}, {"loss": 0.8225, "grad_norm": 1.7029320001602173, "learning_rate": 0.0002, "epoch": 6.516700749829584, "step": 4780}, {"loss": 0.8377, "grad_norm": 1.4663511514663696, "learning_rate": 0.0002, "epoch": 6.530334014996591, "step": 4790}, {"loss": 0.8018, "grad_norm": 1.7682101726531982, "learning_rate": 0.0002, "epoch": 6.543967280163599, "step": 4800}, {"loss": 0.8318, "grad_norm": 1.6056565046310425, "learning_rate": 0.0002, "epoch": 6.557600545330606, "step": 4810}, {"loss": 0.8747, "grad_norm": 1.6552391052246094, "learning_rate": 0.0002, "epoch": 6.571233810497614, "step": 4820}, {"loss": 0.8559, "grad_norm": 1.4265215396881104, "learning_rate": 0.0002, "epoch": 6.584867075664622, "step": 4830}, {"loss": 0.8606, "grad_norm": 1.6225470304489136, "learning_rate": 0.0002, "epoch": 6.598500340831629, "step": 4840}, {"loss": 0.8817, "grad_norm": 1.6568684577941895, "learning_rate": 0.0002, "epoch": 6.612133605998636, "step": 4850}, {"loss": 0.8825, "grad_norm": 1.760115146636963, "learning_rate": 0.0002, "epoch": 6.625766871165644, "step": 4860}, {"loss": 0.9227, "grad_norm": 1.627966046333313, "learning_rate": 0.0002, "epoch": 6.639400136332652, "step": 4870}, {"loss": 0.8825, "grad_norm": 1.7053254842758179, "learning_rate": 0.0002, "epoch": 6.653033401499659, "step": 4880}, {"loss": 0.857, "grad_norm": 1.5339484214782715, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 4890}, {"loss": 0.8482, "grad_norm": 1.5594874620437622, "learning_rate": 0.0002, "epoch": 6.680299931833674, "step": 4900}, {"loss": 0.842, "grad_norm": 1.5322152376174927, "learning_rate": 0.0002, "epoch": 6.693933197000682, "step": 4910}, {"loss": 0.8049, "grad_norm": 1.733410358428955, "learning_rate": 0.0002, "epoch": 6.707566462167689, "step": 4920}, {"loss": 0.9099, "grad_norm": 1.3626887798309326, "learning_rate": 0.0002, "epoch": 6.721199727334697, "step": 4930}, {"loss": 0.9481, "grad_norm": 1.6323494911193848, "learning_rate": 0.0002, "epoch": 6.734832992501704, "step": 4940}, {"loss": 0.8803, "grad_norm": 1.6548917293548584, "learning_rate": 0.0002, "epoch": 6.748466257668712, "step": 4950}, {"loss": 0.9149, "grad_norm": 1.7894278764724731, "learning_rate": 0.0002, "epoch": 6.762099522835719, "step": 4960}, {"loss": 0.9137, "grad_norm": 1.7960841655731201, "learning_rate": 0.0002, "epoch": 6.775732788002727, "step": 4970}, {"loss": 0.9088, "grad_norm": 1.4888852834701538, "learning_rate": 0.0002, "epoch": 6.789366053169735, "step": 4980}, {"loss": 0.9495, "grad_norm": 1.6368865966796875, "learning_rate": 0.0002, "epoch": 6.802999318336742, "step": 4990}, {"loss": 0.9939, "grad_norm": 1.7106667757034302, "learning_rate": 0.0002, "epoch": 6.816632583503749, "step": 5000}, {"loss": 0.8551, "grad_norm": 4.131956100463867, "learning_rate": 0.0002, "epoch": 6.830265848670757, "step": 5010}, {"loss": 0.908, "grad_norm": 1.6357536315917969, "learning_rate": 0.0002, "epoch": 6.8438991138377645, "step": 5020}, {"loss": 0.8661, "grad_norm": 1.621524453163147, "learning_rate": 0.0002, "epoch": 6.8575323790047715, "step": 5030}, {"loss": 0.9177, "grad_norm": 1.6400790214538574, "learning_rate": 0.0002, "epoch": 6.871165644171779, "step": 5040}, {"loss": 0.9204, "grad_norm": 1.823006272315979, "learning_rate": 0.0002, "epoch": 6.8847989093387865, "step": 5050}, {"loss": 0.9133, "grad_norm": 1.6328210830688477, "learning_rate": 0.0002, "epoch": 6.898432174505794, "step": 5060}, {"loss": 0.9138, "grad_norm": 1.3616089820861816, "learning_rate": 0.0002, "epoch": 6.912065439672801, "step": 5070}, {"loss": 0.8791, "grad_norm": 1.7202986478805542, "learning_rate": 0.0002, "epoch": 6.925698704839809, "step": 5080}, {"loss": 0.8331, "grad_norm": 1.8145297765731812, "learning_rate": 0.0002, "epoch": 6.939331970006816, "step": 5090}, {"loss": 0.861, "grad_norm": 1.5432910919189453, "learning_rate": 0.0002, "epoch": 6.952965235173824, "step": 5100}, {"loss": 0.9282, "grad_norm": 1.2784099578857422, "learning_rate": 0.0002, "epoch": 6.966598500340831, "step": 5110}, {"loss": 0.9189, "grad_norm": 1.556593894958496, "learning_rate": 0.0002, "epoch": 6.980231765507839, "step": 5120}, {"loss": 0.8961, "grad_norm": 1.5102856159210205, "learning_rate": 0.0002, "epoch": 6.993865030674847, "step": 5130}, {"eval_loss": 2.5376713275909424, "eval_runtime": 53.6377, "eval_samples_per_second": 9.452, "eval_steps_per_second": 1.193, "epoch": 6.999318336741649, "step": 5134}, {"loss": 0.7888, "grad_norm": 1.7083442211151123, "learning_rate": 0.0002, "epoch": 7.007498295841854, "step": 5140}, {"loss": 0.581, "grad_norm": 1.95943021774292, "learning_rate": 0.0002, "epoch": 7.021131561008862, "step": 5150}, {"loss": 0.5631, "grad_norm": 1.453168511390686, "learning_rate": 0.0002, "epoch": 7.034764826175869, "step": 5160}, {"loss": 0.5871, "grad_norm": 2.110145092010498, "learning_rate": 0.0002, "epoch": 7.048398091342877, "step": 5170}, {"loss": 0.5418, "grad_norm": 1.567636489868164, "learning_rate": 0.0002, "epoch": 7.062031356509884, "step": 5180}, {"loss": 0.611, "grad_norm": 1.8596835136413574, "learning_rate": 0.0002, "epoch": 7.075664621676892, "step": 5190}, {"loss": 0.6174, "grad_norm": 1.7342605590820312, "learning_rate": 0.0002, "epoch": 7.089297886843899, "step": 5200}, {"loss": 0.5632, "grad_norm": 1.516591191291809, "learning_rate": 0.0002, "epoch": 7.102931152010907, "step": 5210}, {"loss": 0.5977, "grad_norm": 1.7696505784988403, "learning_rate": 0.0002, "epoch": 7.116564417177914, "step": 5220}, {"loss": 0.594, "grad_norm": 2.1680636405944824, "learning_rate": 0.0002, "epoch": 7.130197682344922, "step": 5230}, {"loss": 0.5885, "grad_norm": 1.6825456619262695, "learning_rate": 0.0002, "epoch": 7.143830947511929, "step": 5240}, {"loss": 0.6169, "grad_norm": 2.036949634552002, "learning_rate": 0.0002, "epoch": 7.157464212678937, "step": 5250}, {"loss": 0.6561, "grad_norm": 1.8820315599441528, "learning_rate": 0.0002, "epoch": 7.171097477845944, "step": 5260}, {"loss": 0.6445, "grad_norm": 2.313140630722046, "learning_rate": 0.0002, "epoch": 7.184730743012952, "step": 5270}, {"loss": 0.6265, "grad_norm": 2.0305309295654297, "learning_rate": 0.0002, "epoch": 7.198364008179959, "step": 5280}, {"loss": 0.5909, "grad_norm": 1.707711100578308, "learning_rate": 0.0002, "epoch": 7.211997273346967, "step": 5290}, {"loss": 0.6321, "grad_norm": 1.687009334564209, "learning_rate": 0.0002, "epoch": 7.2256305385139745, "step": 5300}, {"loss": 0.6435, "grad_norm": 2.0011701583862305, "learning_rate": 0.0002, "epoch": 7.2392638036809815, "step": 5310}, {"loss": 0.5955, "grad_norm": 1.9455368518829346, "learning_rate": 0.0002, "epoch": 7.2528970688479895, "step": 5320}, {"loss": 0.6644, "grad_norm": 1.5780237913131714, "learning_rate": 0.0002, "epoch": 7.2665303340149965, "step": 5330}, {"loss": 0.6007, "grad_norm": 2.1882123947143555, "learning_rate": 0.0002, "epoch": 7.280163599182004, "step": 5340}, {"loss": 0.6055, "grad_norm": 2.089590549468994, "learning_rate": 0.0002, "epoch": 7.293796864349011, "step": 5350}, {"loss": 0.6138, "grad_norm": 1.8626707792282104, "learning_rate": 0.0002, "epoch": 7.307430129516019, "step": 5360}, {"loss": 0.6269, "grad_norm": 2.127977132797241, "learning_rate": 0.0002, "epoch": 7.321063394683026, "step": 5370}, {"loss": 0.6629, "grad_norm": 1.6568187475204468, "learning_rate": 0.0002, "epoch": 7.334696659850034, "step": 5380}, {"loss": 0.6263, "grad_norm": 1.5592522621154785, "learning_rate": 0.0002, "epoch": 7.348329925017041, "step": 5390}, {"loss": 0.6351, "grad_norm": 1.7897852659225464, "learning_rate": 0.0002, "epoch": 7.361963190184049, "step": 5400}, {"loss": 0.6319, "grad_norm": 2.071516275405884, "learning_rate": 0.0002, "epoch": 7.375596455351056, "step": 5410}, {"loss": 0.6486, "grad_norm": 2.048238515853882, "learning_rate": 0.0002, "epoch": 7.389229720518064, "step": 5420}, {"loss": 0.7037, "grad_norm": 1.770015001296997, "learning_rate": 0.0002, "epoch": 7.402862985685071, "step": 5430}, {"loss": 0.6719, "grad_norm": 1.7530136108398438, "learning_rate": 0.0002, "epoch": 7.416496250852079, "step": 5440}, {"loss": 0.6392, "grad_norm": 1.8113389015197754, "learning_rate": 0.0002, "epoch": 7.430129516019086, "step": 5450}, {"loss": 0.6589, "grad_norm": 1.8129119873046875, "learning_rate": 0.0002, "epoch": 7.443762781186094, "step": 5460}, {"loss": 0.6786, "grad_norm": 1.7961417436599731, "learning_rate": 0.0002, "epoch": 7.457396046353102, "step": 5470}, {"loss": 0.6492, "grad_norm": 1.8811243772506714, "learning_rate": 0.0002, "epoch": 7.471029311520109, "step": 5480}, {"loss": 0.6442, "grad_norm": 1.9619536399841309, "learning_rate": 0.0002, "epoch": 7.484662576687117, "step": 5490}, {"loss": 0.6372, "grad_norm": 1.9449920654296875, "learning_rate": 0.0002, "epoch": 7.498295841854124, "step": 5500}, {"loss": 0.6543, "grad_norm": 2.0600240230560303, "learning_rate": 0.0002, "epoch": 7.511929107021132, "step": 5510}, {"loss": 0.6695, "grad_norm": 1.9339587688446045, "learning_rate": 0.0002, "epoch": 7.525562372188139, "step": 5520}, {"loss": 0.7385, "grad_norm": 2.0672056674957275, "learning_rate": 0.0002, "epoch": 7.539195637355147, "step": 5530}, {"loss": 0.6678, "grad_norm": 1.8305774927139282, "learning_rate": 0.0002, "epoch": 7.552828902522154, "step": 5540}, {"loss": 0.7, "grad_norm": 1.9546589851379395, "learning_rate": 0.0002, "epoch": 7.566462167689162, "step": 5550}, {"loss": 0.6481, "grad_norm": 1.657498836517334, "learning_rate": 0.0002, "epoch": 7.580095432856169, "step": 5560}, {"loss": 0.6604, "grad_norm": 2.0222396850585938, "learning_rate": 0.0002, "epoch": 7.593728698023177, "step": 5570}, {"loss": 0.7154, "grad_norm": 1.9352941513061523, "learning_rate": 0.0002, "epoch": 7.6073619631901845, "step": 5580}, {"loss": 0.6926, "grad_norm": 1.9743294715881348, "learning_rate": 0.0002, "epoch": 7.620995228357192, "step": 5590}, {"loss": 0.6948, "grad_norm": 1.949228048324585, "learning_rate": 0.0002, "epoch": 7.634628493524199, "step": 5600}, {"loss": 0.6755, "grad_norm": 2.009384870529175, "learning_rate": 0.0002, "epoch": 7.6482617586912065, "step": 5610}, {"loss": 0.7467, "grad_norm": 1.9622714519500732, "learning_rate": 0.0002, "epoch": 7.661895023858214, "step": 5620}, {"loss": 0.7047, "grad_norm": 2.142486810684204, "learning_rate": 0.0002, "epoch": 7.675528289025221, "step": 5630}, {"loss": 0.6908, "grad_norm": 2.4306538105010986, "learning_rate": 0.0002, "epoch": 7.689161554192229, "step": 5640}, {"loss": 0.6592, "grad_norm": 1.8343422412872314, "learning_rate": 0.0002, "epoch": 7.702794819359236, "step": 5650}, {"loss": 0.7052, "grad_norm": 2.1571617126464844, "learning_rate": 0.0002, "epoch": 7.716428084526244, "step": 5660}, {"loss": 0.6958, "grad_norm": 2.028083086013794, "learning_rate": 0.0002, "epoch": 7.730061349693251, "step": 5670}, {"loss": 0.6896, "grad_norm": 2.0310823917388916, "learning_rate": 0.0002, "epoch": 7.743694614860259, "step": 5680}, {"loss": 0.6927, "grad_norm": 1.9675135612487793, "learning_rate": 0.0002, "epoch": 7.757327880027266, "step": 5690}, {"loss": 0.6985, "grad_norm": 2.082470417022705, "learning_rate": 0.0002, "epoch": 7.770961145194274, "step": 5700}, {"loss": 0.7519, "grad_norm": 1.8454886674880981, "learning_rate": 0.0002, "epoch": 7.784594410361281, "step": 5710}, {"loss": 0.7407, "grad_norm": 2.0777692794799805, "learning_rate": 0.0002, "epoch": 7.798227675528289, "step": 5720}, {"loss": 0.7149, "grad_norm": 1.751173496246338, "learning_rate": 0.0002, "epoch": 7.811860940695297, "step": 5730}, {"loss": 0.7126, "grad_norm": 1.7728252410888672, "learning_rate": 0.0002, "epoch": 7.825494205862304, "step": 5740}, {"loss": 0.7494, "grad_norm": 1.9239917993545532, "learning_rate": 0.0002, "epoch": 7.839127471029311, "step": 5750}, {"loss": 0.7374, "grad_norm": 2.0526111125946045, "learning_rate": 0.0002, "epoch": 7.852760736196319, "step": 5760}, {"loss": 0.713, "grad_norm": 2.097938060760498, "learning_rate": 0.0002, "epoch": 7.866394001363327, "step": 5770}, {"loss": 0.7391, "grad_norm": 1.8992373943328857, "learning_rate": 0.0002, "epoch": 7.880027266530334, "step": 5780}, {"loss": 0.6744, "grad_norm": 1.812042474746704, "learning_rate": 0.0002, "epoch": 7.893660531697342, "step": 5790}, {"loss": 0.7803, "grad_norm": 1.9535222053527832, "learning_rate": 0.0002, "epoch": 7.907293796864349, "step": 5800}, {"loss": 0.6998, "grad_norm": 2.0650830268859863, "learning_rate": 0.0002, "epoch": 7.920927062031357, "step": 5810}, {"loss": 0.7134, "grad_norm": 1.818130612373352, "learning_rate": 0.0002, "epoch": 7.934560327198364, "step": 5820}, {"loss": 0.7356, "grad_norm": 1.9505265951156616, "learning_rate": 0.0002, "epoch": 7.948193592365372, "step": 5830}, {"loss": 0.6872, "grad_norm": 2.072112798690796, "learning_rate": 0.0002, "epoch": 7.961826857532379, "step": 5840}, {"loss": 0.6792, "grad_norm": 1.6640431880950928, "learning_rate": 0.0002, "epoch": 7.975460122699387, "step": 5850}, {"loss": 0.7391, "grad_norm": 1.7920113801956177, "learning_rate": 0.0002, "epoch": 7.989093387866394, "step": 5860}]}