diff --git a/.gitattributes b/.gitattributes index d13c36e40ae6cbc37de042069d05a0241a63d314..82ed3f8dbfc0ed9a81e12460b22dfd5463cb0b28 100644 --- a/.gitattributes +++ b/.gitattributes @@ -789,3 +789,12 @@ gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-fixed_num-p-0.03-num-200-sd-42/checkpoint-40/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-fixed_num-p-0.03-num-200-sd-42/checkpoint-81/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-fixed_num-p-0.03-num-200-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b971d2d3728c073474494167a8733e971e412cf --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a973c78893d5c5f843df8157e4f5f5d58bd4afd185281397acca339964604cf +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aca91d18460f5dd0046c61b1bcc21240a46ec829 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c6f69d70eebb972d5f2aea734807cdc237ada8f0fd2bcc1d42cf91146986ff +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c1e95bd59d08cef07e5077d41462b847bfcb154 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27572929ff2cbf856253f58f8e5059eee8bb373275a4d66ee2157c75df2e9641 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e43d3a86d9e002f165c21f20848463d0d9b4a483 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c62348675de8dfb807cfdc1360f392af5cab7689636d8eb0d4df84ca06a498 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d4595f2f1663ea0bd12a695c82cde8c2bf8a19e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc0c19e0ddc643f135d0fdd597ab58ee08d8a244658aab7ba0b5d38110da455 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..81f785078fede5007798b307e9120007bd9cd895 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/trainer_state.json @@ -0,0 +1,974 @@ +{ + "best_metric": 1.8279441595077515, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", + "epoch": 2.998857142857143, + "eval_steps": 10, + "global_step": 1312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022857142857142857, + "grad_norm": 0.6273946762084961, + "learning_rate": 0.0002, + "loss": 3.066, + "step": 10 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.5300710201263428, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 20 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.6162196397781372, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 30 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.5143047571182251, + "learning_rate": 0.0002, + "loss": 2.1164, + "step": 40 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.4000673294067383, + "learning_rate": 0.0002, + "loss": 1.943, + "step": 50 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.444892555475235, + "learning_rate": 0.0002, + "loss": 1.9531, + "step": 60 + }, + { + "epoch": 0.16, + "grad_norm": 0.4871707558631897, + "learning_rate": 0.0002, + "loss": 1.9435, + "step": 70 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.451060026884079, + "learning_rate": 0.0002, + "loss": 1.9072, + "step": 80 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.3939569592475891, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 90 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.5033721923828125, + "learning_rate": 0.0002, + "loss": 1.8982, + "step": 100 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0002, + "loss": 1.9148, + "step": 110 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.4391206204891205, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 120 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.5243169665336609, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 0.4055655598640442, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 140 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.0002, + "loss": 1.8348, + "step": 150 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.4696349501609802, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 160 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.3987901508808136, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 170 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.32404327392578125, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 180 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.3692261576652527, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 190 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37267744541168213, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 200 + }, + { + "epoch": 0.48, + "grad_norm": 0.3559934198856354, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 210 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.3374815285205841, + "learning_rate": 0.0002, + "loss": 1.8651, + "step": 220 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.34598177671432495, + "learning_rate": 0.0002, + "loss": 1.8683, + "step": 230 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.35629919171333313, + "learning_rate": 0.0002, + "loss": 1.8554, + "step": 240 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3586862087249756, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 250 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.3198927342891693, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.37690025568008423, + "learning_rate": 0.0002, + "loss": 1.81, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.2855667471885681, + "learning_rate": 0.0002, + "loss": 1.8258, + "step": 280 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.3242695927619934, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 290 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2960120141506195, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 300 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.3596384823322296, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 310 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.3001834750175476, + "learning_rate": 0.0002, + "loss": 1.8132, + "step": 320 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.31361159682273865, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 330 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.3383876085281372, + "learning_rate": 0.0002, + "loss": 1.7674, + "step": 350 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.35100996494293213, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 360 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.344976007938385, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 370 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.3119729459285736, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 380 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.349221795797348, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 390 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3124293386936188, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 400 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.35504350066185, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 0.310310959815979, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 420 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.30432847142219543, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 430 + }, + { + "epoch": 0.9988571428571429, + "eval_loss": 1.8310153484344482, + "eval_runtime": 111.7814, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.573, + "step": 437 + }, + { + "epoch": 1.0057142857142858, + "grad_norm": 0.3121616840362549, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 440 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.3365118205547333, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 450 + }, + { + "epoch": 1.0514285714285714, + "grad_norm": 0.3626686930656433, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 460 + }, + { + "epoch": 1.0742857142857143, + "grad_norm": 0.30539533495903015, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 470 + }, + { + "epoch": 1.0971428571428572, + "grad_norm": 0.3159816861152649, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 480 + }, + { + "epoch": 1.12, + "grad_norm": 0.3695855736732483, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 490 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3609161674976349, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 500 + }, + { + "epoch": 1.1657142857142857, + "grad_norm": 0.3683869242668152, + "learning_rate": 0.0002, + "loss": 1.8723, + "step": 510 + }, + { + "epoch": 1.1885714285714286, + "grad_norm": 0.3862539529800415, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 520 + }, + { + "epoch": 1.2114285714285715, + "grad_norm": 0.4244740307331085, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 530 + }, + { + "epoch": 1.2342857142857142, + "grad_norm": 0.373703271150589, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 540 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.35715773701667786, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 550 + }, + { + "epoch": 1.28, + "grad_norm": 0.3555964231491089, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 560 + }, + { + "epoch": 1.302857142857143, + "grad_norm": 0.35080263018608093, + "learning_rate": 0.0002, + "loss": 1.7228, + "step": 570 + }, + { + "epoch": 1.3257142857142856, + "grad_norm": 0.3589482307434082, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 580 + }, + { + "epoch": 1.3485714285714285, + "grad_norm": 0.3711223900318146, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 590 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.313614159822464, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 600 + }, + { + "epoch": 1.3942857142857144, + "grad_norm": 0.3842357397079468, + "learning_rate": 0.0002, + "loss": 1.7191, + "step": 610 + }, + { + "epoch": 1.4171428571428573, + "grad_norm": 0.36126819252967834, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 620 + }, + { + "epoch": 1.44, + "grad_norm": 0.35922661423683167, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 630 + }, + { + "epoch": 1.4628571428571429, + "grad_norm": 0.3922875225543976, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 640 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.365546852350235, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 650 + }, + { + "epoch": 1.5085714285714285, + "grad_norm": 0.36107590794563293, + "learning_rate": 0.0002, + "loss": 1.674, + "step": 660 + }, + { + "epoch": 1.5314285714285716, + "grad_norm": 0.3307042121887207, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 670 + }, + { + "epoch": 1.5542857142857143, + "grad_norm": 0.3492133915424347, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 680 + }, + { + "epoch": 1.5771428571428572, + "grad_norm": 0.38608574867248535, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 690 + }, + { + "epoch": 1.6, + "grad_norm": 0.3489173650741577, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 700 + }, + { + "epoch": 1.6228571428571428, + "grad_norm": 0.36614152789115906, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 710 + }, + { + "epoch": 1.6457142857142857, + "grad_norm": 0.34340205788612366, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 720 + }, + { + "epoch": 1.6685714285714286, + "grad_norm": 0.34590771794319153, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 730 + }, + { + "epoch": 1.6914285714285713, + "grad_norm": 0.3759954273700714, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 740 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3753475546836853, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 750 + }, + { + "epoch": 1.737142857142857, + "grad_norm": 0.38416001200675964, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 760 + }, + { + "epoch": 1.76, + "grad_norm": 0.36223554611206055, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 770 + }, + { + "epoch": 1.782857142857143, + "grad_norm": 0.329556941986084, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 780 + }, + { + "epoch": 1.8057142857142856, + "grad_norm": 0.34008052945137024, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 790 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.40297919511795044, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 1.8514285714285714, + "grad_norm": 0.35378390550613403, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 810 + }, + { + "epoch": 1.8742857142857143, + "grad_norm": 0.3625478148460388, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 820 + }, + { + "epoch": 1.8971428571428572, + "grad_norm": 0.36153221130371094, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 830 + }, + { + "epoch": 1.92, + "grad_norm": 0.3612948954105377, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 840 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.399213045835495, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 850 + }, + { + "epoch": 1.9657142857142857, + "grad_norm": 0.40026402473449707, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 860 + }, + { + "epoch": 1.9885714285714284, + "grad_norm": 0.38114118576049805, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 870 + }, + { + "epoch": 2.0, + "eval_loss": 1.8279441595077515, + "eval_runtime": 111.1652, + "eval_samples_per_second": 4.561, + "eval_steps_per_second": 0.576, + "step": 875 + }, + { + "epoch": 2.0114285714285716, + "grad_norm": 0.33838793635368347, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 880 + }, + { + "epoch": 2.0342857142857143, + "grad_norm": 0.4219334125518799, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 890 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.43962377309799194, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 900 + }, + { + "epoch": 2.08, + "grad_norm": 0.41956576704978943, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 910 + }, + { + "epoch": 2.1028571428571428, + "grad_norm": 0.4439629912376404, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 920 + }, + { + "epoch": 2.125714285714286, + "grad_norm": 0.43405696749687195, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 930 + }, + { + "epoch": 2.1485714285714286, + "grad_norm": 0.4321737587451935, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 940 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.4689100682735443, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 950 + }, + { + "epoch": 2.1942857142857144, + "grad_norm": 0.47024697065353394, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 960 + }, + { + "epoch": 2.217142857142857, + "grad_norm": 0.4535103440284729, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 970 + }, + { + "epoch": 2.24, + "grad_norm": 0.45990121364593506, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 980 + }, + { + "epoch": 2.262857142857143, + "grad_norm": 0.48427215218544006, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 990 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.43076643347740173, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1000 + }, + { + "epoch": 2.3085714285714287, + "grad_norm": 0.4854483902454376, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 1010 + }, + { + "epoch": 2.3314285714285714, + "grad_norm": 0.46086496114730835, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1020 + }, + { + "epoch": 2.354285714285714, + "grad_norm": 0.4714847505092621, + "learning_rate": 0.0002, + "loss": 1.6392, + "step": 1030 + }, + { + "epoch": 2.3771428571428572, + "grad_norm": 0.4423409402370453, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 1040 + }, + { + "epoch": 2.4, + "grad_norm": 0.46261295676231384, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 1050 + }, + { + "epoch": 2.422857142857143, + "grad_norm": 0.4914337396621704, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 1060 + }, + { + "epoch": 2.4457142857142857, + "grad_norm": 0.45144036412239075, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 1070 + }, + { + "epoch": 2.4685714285714284, + "grad_norm": 0.4510825276374817, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 1080 + }, + { + "epoch": 2.4914285714285715, + "grad_norm": 0.48552489280700684, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 1090 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.4768163859844208, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 1100 + }, + { + "epoch": 2.5371428571428574, + "grad_norm": 0.5192609429359436, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 1110 + }, + { + "epoch": 2.56, + "grad_norm": 0.49308598041534424, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 1120 + }, + { + "epoch": 2.5828571428571427, + "grad_norm": 0.5068584084510803, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1130 + }, + { + "epoch": 2.605714285714286, + "grad_norm": 0.4822661280632019, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 1140 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 0.5028144717216492, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 1150 + }, + { + "epoch": 2.6514285714285712, + "grad_norm": 0.48315200209617615, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1160 + }, + { + "epoch": 2.6742857142857144, + "grad_norm": 0.551934540271759, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 1170 + }, + { + "epoch": 2.697142857142857, + "grad_norm": 0.49223729968070984, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 1180 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.514847457408905, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 1190 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.4830605387687683, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 1200 + }, + { + "epoch": 2.7657142857142856, + "grad_norm": 0.4584822952747345, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 1210 + }, + { + "epoch": 2.7885714285714287, + "grad_norm": 0.4688762426376343, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 1220 + }, + { + "epoch": 2.8114285714285714, + "grad_norm": 0.4488156735897064, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1230 + }, + { + "epoch": 2.8342857142857145, + "grad_norm": 0.4700278639793396, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 1240 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5282207131385803, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1250 + }, + { + "epoch": 2.88, + "grad_norm": 0.4874219000339508, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 1260 + }, + { + "epoch": 2.902857142857143, + "grad_norm": 0.49468332529067993, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 1270 + }, + { + "epoch": 2.9257142857142857, + "grad_norm": 0.49770233035087585, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 1280 + }, + { + "epoch": 2.9485714285714284, + "grad_norm": 0.4433252811431885, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 1290 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.46836379170417786, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 1300 + }, + { + "epoch": 2.994285714285714, + "grad_norm": 0.5001904368400574, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1310 + }, + { + "epoch": 2.998857142857143, + "eval_loss": 1.8684407472610474, + "eval_runtime": 111.2835, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1312 + } + ], + "logging_steps": 10, + "max_steps": 3496, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.741491613696e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1312/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3f5e59110eabce72d08ce0237a3c9fc0572c7edb --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1750e91eb527bd511e256a80ba1d623c41bee957c5f1b88c3a3feede9651275f +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..47aeeaf29ac5ae512dc6432b8bc3084c63fb8815 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78544d3cfcb349ba50a10fed20793113a10afd6e720e8d0e5e84cedf15ed694d +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..99ce54da6e759399277a9ae48dc5955dce873c26 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4cd5653f9fb65ce60bb97044c048a65fcc5c435bf82c98d80a311e76d97d93 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..178b869d2471e46c12c962fa4d3f49da80ade0ba --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c637c522eb7be4f20db9bf30934ffbcff195e29b30bd9e03e47a7a09113597c +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0d1dff582f37765ea5b94f069e83da291fb056ed --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/trainer_state.json @@ -0,0 +1,1290 @@ +{ + "best_metric": 1.8279441595077515, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 1750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022857142857142857, + "grad_norm": 0.6273946762084961, + "learning_rate": 0.0002, + "loss": 3.066, + "step": 10 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.5300710201263428, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 20 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.6162196397781372, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 30 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.5143047571182251, + "learning_rate": 0.0002, + "loss": 2.1164, + "step": 40 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.4000673294067383, + "learning_rate": 0.0002, + "loss": 1.943, + "step": 50 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.444892555475235, + "learning_rate": 0.0002, + "loss": 1.9531, + "step": 60 + }, + { + "epoch": 0.16, + "grad_norm": 0.4871707558631897, + "learning_rate": 0.0002, + "loss": 1.9435, + "step": 70 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.451060026884079, + "learning_rate": 0.0002, + "loss": 1.9072, + "step": 80 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.3939569592475891, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 90 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.5033721923828125, + "learning_rate": 0.0002, + "loss": 1.8982, + "step": 100 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0002, + "loss": 1.9148, + "step": 110 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.4391206204891205, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 120 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.5243169665336609, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 0.4055655598640442, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 140 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.0002, + "loss": 1.8348, + "step": 150 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.4696349501609802, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 160 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.3987901508808136, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 170 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.32404327392578125, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 180 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.3692261576652527, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 190 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37267744541168213, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 200 + }, + { + "epoch": 0.48, + "grad_norm": 0.3559934198856354, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 210 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.3374815285205841, + "learning_rate": 0.0002, + "loss": 1.8651, + "step": 220 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.34598177671432495, + "learning_rate": 0.0002, + "loss": 1.8683, + "step": 230 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.35629919171333313, + "learning_rate": 0.0002, + "loss": 1.8554, + "step": 240 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3586862087249756, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 250 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.3198927342891693, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.37690025568008423, + "learning_rate": 0.0002, + "loss": 1.81, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.2855667471885681, + "learning_rate": 0.0002, + "loss": 1.8258, + "step": 280 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.3242695927619934, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 290 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2960120141506195, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 300 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.3596384823322296, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 310 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.3001834750175476, + "learning_rate": 0.0002, + "loss": 1.8132, + "step": 320 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.31361159682273865, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 330 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.3383876085281372, + "learning_rate": 0.0002, + "loss": 1.7674, + "step": 350 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.35100996494293213, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 360 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.344976007938385, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 370 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.3119729459285736, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 380 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.349221795797348, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 390 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3124293386936188, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 400 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.35504350066185, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 0.310310959815979, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 420 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.30432847142219543, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 430 + }, + { + "epoch": 0.9988571428571429, + "eval_loss": 1.8310153484344482, + "eval_runtime": 111.7814, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.573, + "step": 437 + }, + { + "epoch": 1.0057142857142858, + "grad_norm": 0.3121616840362549, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 440 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.3365118205547333, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 450 + }, + { + "epoch": 1.0514285714285714, + "grad_norm": 0.3626686930656433, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 460 + }, + { + "epoch": 1.0742857142857143, + "grad_norm": 0.30539533495903015, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 470 + }, + { + "epoch": 1.0971428571428572, + "grad_norm": 0.3159816861152649, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 480 + }, + { + "epoch": 1.12, + "grad_norm": 0.3695855736732483, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 490 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3609161674976349, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 500 + }, + { + "epoch": 1.1657142857142857, + "grad_norm": 0.3683869242668152, + "learning_rate": 0.0002, + "loss": 1.8723, + "step": 510 + }, + { + "epoch": 1.1885714285714286, + "grad_norm": 0.3862539529800415, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 520 + }, + { + "epoch": 1.2114285714285715, + "grad_norm": 0.4244740307331085, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 530 + }, + { + "epoch": 1.2342857142857142, + "grad_norm": 0.373703271150589, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 540 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.35715773701667786, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 550 + }, + { + "epoch": 1.28, + "grad_norm": 0.3555964231491089, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 560 + }, + { + "epoch": 1.302857142857143, + "grad_norm": 0.35080263018608093, + "learning_rate": 0.0002, + "loss": 1.7228, + "step": 570 + }, + { + "epoch": 1.3257142857142856, + "grad_norm": 0.3589482307434082, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 580 + }, + { + "epoch": 1.3485714285714285, + "grad_norm": 0.3711223900318146, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 590 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.313614159822464, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 600 + }, + { + "epoch": 1.3942857142857144, + "grad_norm": 0.3842357397079468, + "learning_rate": 0.0002, + "loss": 1.7191, + "step": 610 + }, + { + "epoch": 1.4171428571428573, + "grad_norm": 0.36126819252967834, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 620 + }, + { + "epoch": 1.44, + "grad_norm": 0.35922661423683167, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 630 + }, + { + "epoch": 1.4628571428571429, + "grad_norm": 0.3922875225543976, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 640 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.365546852350235, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 650 + }, + { + "epoch": 1.5085714285714285, + "grad_norm": 0.36107590794563293, + "learning_rate": 0.0002, + "loss": 1.674, + "step": 660 + }, + { + "epoch": 1.5314285714285716, + "grad_norm": 0.3307042121887207, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 670 + }, + { + "epoch": 1.5542857142857143, + "grad_norm": 0.3492133915424347, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 680 + }, + { + "epoch": 1.5771428571428572, + "grad_norm": 0.38608574867248535, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 690 + }, + { + "epoch": 1.6, + "grad_norm": 0.3489173650741577, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 700 + }, + { + "epoch": 1.6228571428571428, + "grad_norm": 0.36614152789115906, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 710 + }, + { + "epoch": 1.6457142857142857, + "grad_norm": 0.34340205788612366, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 720 + }, + { + "epoch": 1.6685714285714286, + "grad_norm": 0.34590771794319153, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 730 + }, + { + "epoch": 1.6914285714285713, + "grad_norm": 0.3759954273700714, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 740 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3753475546836853, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 750 + }, + { + "epoch": 1.737142857142857, + "grad_norm": 0.38416001200675964, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 760 + }, + { + "epoch": 1.76, + "grad_norm": 0.36223554611206055, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 770 + }, + { + "epoch": 1.782857142857143, + "grad_norm": 0.329556941986084, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 780 + }, + { + "epoch": 1.8057142857142856, + "grad_norm": 0.34008052945137024, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 790 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.40297919511795044, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 1.8514285714285714, + "grad_norm": 0.35378390550613403, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 810 + }, + { + "epoch": 1.8742857142857143, + "grad_norm": 0.3625478148460388, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 820 + }, + { + "epoch": 1.8971428571428572, + "grad_norm": 0.36153221130371094, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 830 + }, + { + "epoch": 1.92, + "grad_norm": 0.3612948954105377, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 840 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.399213045835495, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 850 + }, + { + "epoch": 1.9657142857142857, + "grad_norm": 0.40026402473449707, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 860 + }, + { + "epoch": 1.9885714285714284, + "grad_norm": 0.38114118576049805, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 870 + }, + { + "epoch": 2.0, + "eval_loss": 1.8279441595077515, + "eval_runtime": 111.1652, + "eval_samples_per_second": 4.561, + "eval_steps_per_second": 0.576, + "step": 875 + }, + { + "epoch": 2.0114285714285716, + "grad_norm": 0.33838793635368347, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 880 + }, + { + "epoch": 2.0342857142857143, + "grad_norm": 0.4219334125518799, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 890 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.43962377309799194, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 900 + }, + { + "epoch": 2.08, + "grad_norm": 0.41956576704978943, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 910 + }, + { + "epoch": 2.1028571428571428, + "grad_norm": 0.4439629912376404, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 920 + }, + { + "epoch": 2.125714285714286, + "grad_norm": 0.43405696749687195, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 930 + }, + { + "epoch": 2.1485714285714286, + "grad_norm": 0.4321737587451935, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 940 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.4689100682735443, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 950 + }, + { + "epoch": 2.1942857142857144, + "grad_norm": 0.47024697065353394, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 960 + }, + { + "epoch": 2.217142857142857, + "grad_norm": 0.4535103440284729, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 970 + }, + { + "epoch": 2.24, + "grad_norm": 0.45990121364593506, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 980 + }, + { + "epoch": 2.262857142857143, + "grad_norm": 0.48427215218544006, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 990 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.43076643347740173, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1000 + }, + { + "epoch": 2.3085714285714287, + "grad_norm": 0.4854483902454376, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 1010 + }, + { + "epoch": 2.3314285714285714, + "grad_norm": 0.46086496114730835, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1020 + }, + { + "epoch": 2.354285714285714, + "grad_norm": 0.4714847505092621, + "learning_rate": 0.0002, + "loss": 1.6392, + "step": 1030 + }, + { + "epoch": 2.3771428571428572, + "grad_norm": 0.4423409402370453, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 1040 + }, + { + "epoch": 2.4, + "grad_norm": 0.46261295676231384, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 1050 + }, + { + "epoch": 2.422857142857143, + "grad_norm": 0.4914337396621704, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 1060 + }, + { + "epoch": 2.4457142857142857, + "grad_norm": 0.45144036412239075, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 1070 + }, + { + "epoch": 2.4685714285714284, + "grad_norm": 0.4510825276374817, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 1080 + }, + { + "epoch": 2.4914285714285715, + "grad_norm": 0.48552489280700684, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 1090 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.4768163859844208, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 1100 + }, + { + "epoch": 2.5371428571428574, + "grad_norm": 0.5192609429359436, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 1110 + }, + { + "epoch": 2.56, + "grad_norm": 0.49308598041534424, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 1120 + }, + { + "epoch": 2.5828571428571427, + "grad_norm": 0.5068584084510803, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1130 + }, + { + "epoch": 2.605714285714286, + "grad_norm": 0.4822661280632019, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 1140 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 0.5028144717216492, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 1150 + }, + { + "epoch": 2.6514285714285712, + "grad_norm": 0.48315200209617615, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1160 + }, + { + "epoch": 2.6742857142857144, + "grad_norm": 0.551934540271759, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 1170 + }, + { + "epoch": 2.697142857142857, + "grad_norm": 0.49223729968070984, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 1180 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.514847457408905, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 1190 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.4830605387687683, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 1200 + }, + { + "epoch": 2.7657142857142856, + "grad_norm": 0.4584822952747345, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 1210 + }, + { + "epoch": 2.7885714285714287, + "grad_norm": 0.4688762426376343, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 1220 + }, + { + "epoch": 2.8114285714285714, + "grad_norm": 0.4488156735897064, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1230 + }, + { + "epoch": 2.8342857142857145, + "grad_norm": 0.4700278639793396, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 1240 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5282207131385803, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1250 + }, + { + "epoch": 2.88, + "grad_norm": 0.4874219000339508, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 1260 + }, + { + "epoch": 2.902857142857143, + "grad_norm": 0.49468332529067993, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 1270 + }, + { + "epoch": 2.9257142857142857, + "grad_norm": 0.49770233035087585, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 1280 + }, + { + "epoch": 2.9485714285714284, + "grad_norm": 0.4433252811431885, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 1290 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.46836379170417786, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 1300 + }, + { + "epoch": 2.994285714285714, + "grad_norm": 0.5001904368400574, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1310 + }, + { + "epoch": 2.998857142857143, + "eval_loss": 1.8684407472610474, + "eval_runtime": 111.2835, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1312 + }, + { + "epoch": 3.0171428571428573, + "grad_norm": 0.5184893012046814, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1320 + }, + { + "epoch": 3.04, + "grad_norm": 0.5665355920791626, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 1330 + }, + { + "epoch": 3.0628571428571427, + "grad_norm": 0.6601403951644897, + "learning_rate": 0.0002, + "loss": 1.3741, + "step": 1340 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 0.6921621561050415, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 1350 + }, + { + "epoch": 3.1085714285714285, + "grad_norm": 0.6406348943710327, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 1360 + }, + { + "epoch": 3.1314285714285712, + "grad_norm": 0.5814554691314697, + "learning_rate": 0.0002, + "loss": 1.3563, + "step": 1370 + }, + { + "epoch": 3.1542857142857144, + "grad_norm": 0.683325469493866, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 1380 + }, + { + "epoch": 3.177142857142857, + "grad_norm": 0.6686155200004578, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 1390 + }, + { + "epoch": 3.2, + "grad_norm": 0.8159713745117188, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 1400 + }, + { + "epoch": 3.222857142857143, + "grad_norm": 0.646216094493866, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 1410 + }, + { + "epoch": 3.2457142857142856, + "grad_norm": 0.7323529720306396, + "learning_rate": 0.0002, + "loss": 1.4232, + "step": 1420 + }, + { + "epoch": 3.2685714285714287, + "grad_norm": 0.689349353313446, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1430 + }, + { + "epoch": 3.2914285714285714, + "grad_norm": 0.727894127368927, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1440 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 0.6921590566635132, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 1450 + }, + { + "epoch": 3.337142857142857, + "grad_norm": 0.6176243424415588, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 1460 + }, + { + "epoch": 3.36, + "grad_norm": 0.9006354212760925, + "learning_rate": 0.0002, + "loss": 1.4323, + "step": 1470 + }, + { + "epoch": 3.382857142857143, + "grad_norm": 0.8145929574966431, + "learning_rate": 0.0002, + "loss": 1.4353, + "step": 1480 + }, + { + "epoch": 3.4057142857142857, + "grad_norm": 0.6640016436576843, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1490 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.7266780138015747, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 1500 + }, + { + "epoch": 3.4514285714285715, + "grad_norm": 0.9351356029510498, + "learning_rate": 0.0002, + "loss": 1.4108, + "step": 1510 + }, + { + "epoch": 3.474285714285714, + "grad_norm": 0.675645649433136, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 1520 + }, + { + "epoch": 3.4971428571428573, + "grad_norm": 0.761472225189209, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 1530 + }, + { + "epoch": 3.52, + "grad_norm": 0.6653069257736206, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1540 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.667412519454956, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 1550 + }, + { + "epoch": 3.565714285714286, + "grad_norm": 0.6395593881607056, + "learning_rate": 0.0002, + "loss": 1.4241, + "step": 1560 + }, + { + "epoch": 3.5885714285714285, + "grad_norm": 0.7588621377944946, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1570 + }, + { + "epoch": 3.611428571428571, + "grad_norm": 0.6206456422805786, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 1580 + }, + { + "epoch": 3.6342857142857143, + "grad_norm": 0.7591291666030884, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 1590 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 0.6476313471794128, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 1600 + }, + { + "epoch": 3.68, + "grad_norm": 0.6731392741203308, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 1610 + }, + { + "epoch": 3.702857142857143, + "grad_norm": 0.725190281867981, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 1620 + }, + { + "epoch": 3.725714285714286, + "grad_norm": 0.6720049977302551, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 1630 + }, + { + "epoch": 3.7485714285714287, + "grad_norm": 0.6301007270812988, + "learning_rate": 0.0002, + "loss": 1.429, + "step": 1640 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.715893566608429, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 1650 + }, + { + "epoch": 3.7942857142857145, + "grad_norm": 0.7539359927177429, + "learning_rate": 0.0002, + "loss": 1.3624, + "step": 1660 + }, + { + "epoch": 3.817142857142857, + "grad_norm": 0.6658543348312378, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 1670 + }, + { + "epoch": 3.84, + "grad_norm": 0.7019526958465576, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 1680 + }, + { + "epoch": 3.862857142857143, + "grad_norm": 0.6517802476882935, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 1690 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.7617332935333252, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1700 + }, + { + "epoch": 3.9085714285714284, + "grad_norm": 0.6919480562210083, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 1710 + }, + { + "epoch": 3.9314285714285715, + "grad_norm": 0.6987943053245544, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 1720 + }, + { + "epoch": 3.954285714285714, + "grad_norm": 0.7062228918075562, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 1730 + }, + { + "epoch": 3.977142857142857, + "grad_norm": 0.6769542098045349, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 1740 + }, + { + "epoch": 4.0, + "grad_norm": 0.6832144260406494, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 1750 + }, + { + "epoch": 4.0, + "eval_loss": 1.9474865198135376, + "eval_runtime": 111.288, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1750 + } + ], + "logging_steps": 10, + "max_steps": 3496, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.988655484928e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-1750/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..824b95c2b7e49523c0756d62bc3cb36d3c13af69 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1b156501fe3efd5331277d2b001a479c2e4b7a28af955567f4f6646a0ddb0c7 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..21cf4c8eacdd808613116807ba97a81e05d58a0d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02e78e00308b334c9278f652b9d7c9629f126f03ff89e55da48446b1b81ac28 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e2cfeb28991d621d38b81314564cb90650097838 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b926dd98f2307096cd91a16232f4c376f4061f738c6914f96b03494e6fdc2070 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f783b51ae79aa4985c73df8b5ca9af4e84433e9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d8e3dec179fb1345860d00fab246de4b51f381c286b4405ca26f027be2ba498 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b76bd5e6fedf2a32b1e8f50095c67b0f6053da50 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/trainer_state.json @@ -0,0 +1,1599 @@ +{ + "best_metric": 1.8279441595077515, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", + "epoch": 4.998857142857143, + "eval_steps": 10, + "global_step": 2187, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022857142857142857, + "grad_norm": 0.6273946762084961, + "learning_rate": 0.0002, + "loss": 3.066, + "step": 10 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.5300710201263428, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 20 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.6162196397781372, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 30 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.5143047571182251, + "learning_rate": 0.0002, + "loss": 2.1164, + "step": 40 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.4000673294067383, + "learning_rate": 0.0002, + "loss": 1.943, + "step": 50 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.444892555475235, + "learning_rate": 0.0002, + "loss": 1.9531, + "step": 60 + }, + { + "epoch": 0.16, + "grad_norm": 0.4871707558631897, + "learning_rate": 0.0002, + "loss": 1.9435, + "step": 70 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.451060026884079, + "learning_rate": 0.0002, + "loss": 1.9072, + "step": 80 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.3939569592475891, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 90 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.5033721923828125, + "learning_rate": 0.0002, + "loss": 1.8982, + "step": 100 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0002, + "loss": 1.9148, + "step": 110 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.4391206204891205, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 120 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.5243169665336609, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 0.4055655598640442, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 140 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.0002, + "loss": 1.8348, + "step": 150 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.4696349501609802, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 160 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.3987901508808136, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 170 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.32404327392578125, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 180 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.3692261576652527, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 190 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37267744541168213, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 200 + }, + { + "epoch": 0.48, + "grad_norm": 0.3559934198856354, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 210 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.3374815285205841, + "learning_rate": 0.0002, + "loss": 1.8651, + "step": 220 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.34598177671432495, + "learning_rate": 0.0002, + "loss": 1.8683, + "step": 230 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.35629919171333313, + "learning_rate": 0.0002, + "loss": 1.8554, + "step": 240 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3586862087249756, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 250 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.3198927342891693, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.37690025568008423, + "learning_rate": 0.0002, + "loss": 1.81, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.2855667471885681, + "learning_rate": 0.0002, + "loss": 1.8258, + "step": 280 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.3242695927619934, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 290 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2960120141506195, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 300 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.3596384823322296, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 310 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.3001834750175476, + "learning_rate": 0.0002, + "loss": 1.8132, + "step": 320 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.31361159682273865, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 330 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.3383876085281372, + "learning_rate": 0.0002, + "loss": 1.7674, + "step": 350 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.35100996494293213, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 360 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.344976007938385, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 370 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.3119729459285736, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 380 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.349221795797348, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 390 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3124293386936188, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 400 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.35504350066185, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 0.310310959815979, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 420 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.30432847142219543, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 430 + }, + { + "epoch": 0.9988571428571429, + "eval_loss": 1.8310153484344482, + "eval_runtime": 111.7814, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.573, + "step": 437 + }, + { + "epoch": 1.0057142857142858, + "grad_norm": 0.3121616840362549, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 440 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.3365118205547333, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 450 + }, + { + "epoch": 1.0514285714285714, + "grad_norm": 0.3626686930656433, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 460 + }, + { + "epoch": 1.0742857142857143, + "grad_norm": 0.30539533495903015, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 470 + }, + { + "epoch": 1.0971428571428572, + "grad_norm": 0.3159816861152649, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 480 + }, + { + "epoch": 1.12, + "grad_norm": 0.3695855736732483, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 490 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3609161674976349, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 500 + }, + { + "epoch": 1.1657142857142857, + "grad_norm": 0.3683869242668152, + "learning_rate": 0.0002, + "loss": 1.8723, + "step": 510 + }, + { + "epoch": 1.1885714285714286, + "grad_norm": 0.3862539529800415, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 520 + }, + { + "epoch": 1.2114285714285715, + "grad_norm": 0.4244740307331085, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 530 + }, + { + "epoch": 1.2342857142857142, + "grad_norm": 0.373703271150589, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 540 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.35715773701667786, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 550 + }, + { + "epoch": 1.28, + "grad_norm": 0.3555964231491089, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 560 + }, + { + "epoch": 1.302857142857143, + "grad_norm": 0.35080263018608093, + "learning_rate": 0.0002, + "loss": 1.7228, + "step": 570 + }, + { + "epoch": 1.3257142857142856, + "grad_norm": 0.3589482307434082, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 580 + }, + { + "epoch": 1.3485714285714285, + "grad_norm": 0.3711223900318146, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 590 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.313614159822464, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 600 + }, + { + "epoch": 1.3942857142857144, + "grad_norm": 0.3842357397079468, + "learning_rate": 0.0002, + "loss": 1.7191, + "step": 610 + }, + { + "epoch": 1.4171428571428573, + "grad_norm": 0.36126819252967834, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 620 + }, + { + "epoch": 1.44, + "grad_norm": 0.35922661423683167, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 630 + }, + { + "epoch": 1.4628571428571429, + "grad_norm": 0.3922875225543976, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 640 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.365546852350235, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 650 + }, + { + "epoch": 1.5085714285714285, + "grad_norm": 0.36107590794563293, + "learning_rate": 0.0002, + "loss": 1.674, + "step": 660 + }, + { + "epoch": 1.5314285714285716, + "grad_norm": 0.3307042121887207, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 670 + }, + { + "epoch": 1.5542857142857143, + "grad_norm": 0.3492133915424347, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 680 + }, + { + "epoch": 1.5771428571428572, + "grad_norm": 0.38608574867248535, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 690 + }, + { + "epoch": 1.6, + "grad_norm": 0.3489173650741577, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 700 + }, + { + "epoch": 1.6228571428571428, + "grad_norm": 0.36614152789115906, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 710 + }, + { + "epoch": 1.6457142857142857, + "grad_norm": 0.34340205788612366, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 720 + }, + { + "epoch": 1.6685714285714286, + "grad_norm": 0.34590771794319153, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 730 + }, + { + "epoch": 1.6914285714285713, + "grad_norm": 0.3759954273700714, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 740 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3753475546836853, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 750 + }, + { + "epoch": 1.737142857142857, + "grad_norm": 0.38416001200675964, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 760 + }, + { + "epoch": 1.76, + "grad_norm": 0.36223554611206055, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 770 + }, + { + "epoch": 1.782857142857143, + "grad_norm": 0.329556941986084, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 780 + }, + { + "epoch": 1.8057142857142856, + "grad_norm": 0.34008052945137024, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 790 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.40297919511795044, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 1.8514285714285714, + "grad_norm": 0.35378390550613403, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 810 + }, + { + "epoch": 1.8742857142857143, + "grad_norm": 0.3625478148460388, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 820 + }, + { + "epoch": 1.8971428571428572, + "grad_norm": 0.36153221130371094, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 830 + }, + { + "epoch": 1.92, + "grad_norm": 0.3612948954105377, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 840 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.399213045835495, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 850 + }, + { + "epoch": 1.9657142857142857, + "grad_norm": 0.40026402473449707, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 860 + }, + { + "epoch": 1.9885714285714284, + "grad_norm": 0.38114118576049805, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 870 + }, + { + "epoch": 2.0, + "eval_loss": 1.8279441595077515, + "eval_runtime": 111.1652, + "eval_samples_per_second": 4.561, + "eval_steps_per_second": 0.576, + "step": 875 + }, + { + "epoch": 2.0114285714285716, + "grad_norm": 0.33838793635368347, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 880 + }, + { + "epoch": 2.0342857142857143, + "grad_norm": 0.4219334125518799, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 890 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.43962377309799194, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 900 + }, + { + "epoch": 2.08, + "grad_norm": 0.41956576704978943, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 910 + }, + { + "epoch": 2.1028571428571428, + "grad_norm": 0.4439629912376404, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 920 + }, + { + "epoch": 2.125714285714286, + "grad_norm": 0.43405696749687195, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 930 + }, + { + "epoch": 2.1485714285714286, + "grad_norm": 0.4321737587451935, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 940 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.4689100682735443, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 950 + }, + { + "epoch": 2.1942857142857144, + "grad_norm": 0.47024697065353394, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 960 + }, + { + "epoch": 2.217142857142857, + "grad_norm": 0.4535103440284729, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 970 + }, + { + "epoch": 2.24, + "grad_norm": 0.45990121364593506, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 980 + }, + { + "epoch": 2.262857142857143, + "grad_norm": 0.48427215218544006, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 990 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.43076643347740173, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1000 + }, + { + "epoch": 2.3085714285714287, + "grad_norm": 0.4854483902454376, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 1010 + }, + { + "epoch": 2.3314285714285714, + "grad_norm": 0.46086496114730835, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1020 + }, + { + "epoch": 2.354285714285714, + "grad_norm": 0.4714847505092621, + "learning_rate": 0.0002, + "loss": 1.6392, + "step": 1030 + }, + { + "epoch": 2.3771428571428572, + "grad_norm": 0.4423409402370453, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 1040 + }, + { + "epoch": 2.4, + "grad_norm": 0.46261295676231384, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 1050 + }, + { + "epoch": 2.422857142857143, + "grad_norm": 0.4914337396621704, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 1060 + }, + { + "epoch": 2.4457142857142857, + "grad_norm": 0.45144036412239075, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 1070 + }, + { + "epoch": 2.4685714285714284, + "grad_norm": 0.4510825276374817, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 1080 + }, + { + "epoch": 2.4914285714285715, + "grad_norm": 0.48552489280700684, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 1090 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.4768163859844208, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 1100 + }, + { + "epoch": 2.5371428571428574, + "grad_norm": 0.5192609429359436, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 1110 + }, + { + "epoch": 2.56, + "grad_norm": 0.49308598041534424, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 1120 + }, + { + "epoch": 2.5828571428571427, + "grad_norm": 0.5068584084510803, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1130 + }, + { + "epoch": 2.605714285714286, + "grad_norm": 0.4822661280632019, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 1140 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 0.5028144717216492, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 1150 + }, + { + "epoch": 2.6514285714285712, + "grad_norm": 0.48315200209617615, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1160 + }, + { + "epoch": 2.6742857142857144, + "grad_norm": 0.551934540271759, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 1170 + }, + { + "epoch": 2.697142857142857, + "grad_norm": 0.49223729968070984, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 1180 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.514847457408905, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 1190 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.4830605387687683, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 1200 + }, + { + "epoch": 2.7657142857142856, + "grad_norm": 0.4584822952747345, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 1210 + }, + { + "epoch": 2.7885714285714287, + "grad_norm": 0.4688762426376343, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 1220 + }, + { + "epoch": 2.8114285714285714, + "grad_norm": 0.4488156735897064, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1230 + }, + { + "epoch": 2.8342857142857145, + "grad_norm": 0.4700278639793396, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 1240 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5282207131385803, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1250 + }, + { + "epoch": 2.88, + "grad_norm": 0.4874219000339508, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 1260 + }, + { + "epoch": 2.902857142857143, + "grad_norm": 0.49468332529067993, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 1270 + }, + { + "epoch": 2.9257142857142857, + "grad_norm": 0.49770233035087585, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 1280 + }, + { + "epoch": 2.9485714285714284, + "grad_norm": 0.4433252811431885, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 1290 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.46836379170417786, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 1300 + }, + { + "epoch": 2.994285714285714, + "grad_norm": 0.5001904368400574, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1310 + }, + { + "epoch": 2.998857142857143, + "eval_loss": 1.8684407472610474, + "eval_runtime": 111.2835, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1312 + }, + { + "epoch": 3.0171428571428573, + "grad_norm": 0.5184893012046814, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1320 + }, + { + "epoch": 3.04, + "grad_norm": 0.5665355920791626, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 1330 + }, + { + "epoch": 3.0628571428571427, + "grad_norm": 0.6601403951644897, + "learning_rate": 0.0002, + "loss": 1.3741, + "step": 1340 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 0.6921621561050415, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 1350 + }, + { + "epoch": 3.1085714285714285, + "grad_norm": 0.6406348943710327, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 1360 + }, + { + "epoch": 3.1314285714285712, + "grad_norm": 0.5814554691314697, + "learning_rate": 0.0002, + "loss": 1.3563, + "step": 1370 + }, + { + "epoch": 3.1542857142857144, + "grad_norm": 0.683325469493866, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 1380 + }, + { + "epoch": 3.177142857142857, + "grad_norm": 0.6686155200004578, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 1390 + }, + { + "epoch": 3.2, + "grad_norm": 0.8159713745117188, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 1400 + }, + { + "epoch": 3.222857142857143, + "grad_norm": 0.646216094493866, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 1410 + }, + { + "epoch": 3.2457142857142856, + "grad_norm": 0.7323529720306396, + "learning_rate": 0.0002, + "loss": 1.4232, + "step": 1420 + }, + { + "epoch": 3.2685714285714287, + "grad_norm": 0.689349353313446, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1430 + }, + { + "epoch": 3.2914285714285714, + "grad_norm": 0.727894127368927, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1440 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 0.6921590566635132, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 1450 + }, + { + "epoch": 3.337142857142857, + "grad_norm": 0.6176243424415588, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 1460 + }, + { + "epoch": 3.36, + "grad_norm": 0.9006354212760925, + "learning_rate": 0.0002, + "loss": 1.4323, + "step": 1470 + }, + { + "epoch": 3.382857142857143, + "grad_norm": 0.8145929574966431, + "learning_rate": 0.0002, + "loss": 1.4353, + "step": 1480 + }, + { + "epoch": 3.4057142857142857, + "grad_norm": 0.6640016436576843, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1490 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.7266780138015747, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 1500 + }, + { + "epoch": 3.4514285714285715, + "grad_norm": 0.9351356029510498, + "learning_rate": 0.0002, + "loss": 1.4108, + "step": 1510 + }, + { + "epoch": 3.474285714285714, + "grad_norm": 0.675645649433136, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 1520 + }, + { + "epoch": 3.4971428571428573, + "grad_norm": 0.761472225189209, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 1530 + }, + { + "epoch": 3.52, + "grad_norm": 0.6653069257736206, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1540 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.667412519454956, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 1550 + }, + { + "epoch": 3.565714285714286, + "grad_norm": 0.6395593881607056, + "learning_rate": 0.0002, + "loss": 1.4241, + "step": 1560 + }, + { + "epoch": 3.5885714285714285, + "grad_norm": 0.7588621377944946, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1570 + }, + { + "epoch": 3.611428571428571, + "grad_norm": 0.6206456422805786, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 1580 + }, + { + "epoch": 3.6342857142857143, + "grad_norm": 0.7591291666030884, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 1590 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 0.6476313471794128, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 1600 + }, + { + "epoch": 3.68, + "grad_norm": 0.6731392741203308, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 1610 + }, + { + "epoch": 3.702857142857143, + "grad_norm": 0.725190281867981, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 1620 + }, + { + "epoch": 3.725714285714286, + "grad_norm": 0.6720049977302551, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 1630 + }, + { + "epoch": 3.7485714285714287, + "grad_norm": 0.6301007270812988, + "learning_rate": 0.0002, + "loss": 1.429, + "step": 1640 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.715893566608429, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 1650 + }, + { + "epoch": 3.7942857142857145, + "grad_norm": 0.7539359927177429, + "learning_rate": 0.0002, + "loss": 1.3624, + "step": 1660 + }, + { + "epoch": 3.817142857142857, + "grad_norm": 0.6658543348312378, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 1670 + }, + { + "epoch": 3.84, + "grad_norm": 0.7019526958465576, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 1680 + }, + { + "epoch": 3.862857142857143, + "grad_norm": 0.6517802476882935, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 1690 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.7617332935333252, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1700 + }, + { + "epoch": 3.9085714285714284, + "grad_norm": 0.6919480562210083, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 1710 + }, + { + "epoch": 3.9314285714285715, + "grad_norm": 0.6987943053245544, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 1720 + }, + { + "epoch": 3.954285714285714, + "grad_norm": 0.7062228918075562, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 1730 + }, + { + "epoch": 3.977142857142857, + "grad_norm": 0.6769542098045349, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 1740 + }, + { + "epoch": 4.0, + "grad_norm": 0.6832144260406494, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 1750 + }, + { + "epoch": 4.0, + "eval_loss": 1.9474865198135376, + "eval_runtime": 111.288, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1750 + }, + { + "epoch": 4.022857142857143, + "grad_norm": 1.064110279083252, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 1760 + }, + { + "epoch": 4.045714285714285, + "grad_norm": 0.8380683660507202, + "learning_rate": 0.0002, + "loss": 1.2013, + "step": 1770 + }, + { + "epoch": 4.0685714285714285, + "grad_norm": 1.1863020658493042, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 1780 + }, + { + "epoch": 4.091428571428572, + "grad_norm": 1.0128898620605469, + "learning_rate": 0.0002, + "loss": 1.2499, + "step": 1790 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 0.9221312403678894, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 1800 + }, + { + "epoch": 4.137142857142857, + "grad_norm": 1.1298727989196777, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 1810 + }, + { + "epoch": 4.16, + "grad_norm": 0.8854547739028931, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 1820 + }, + { + "epoch": 4.182857142857143, + "grad_norm": 0.8920808434486389, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 1830 + }, + { + "epoch": 4.2057142857142855, + "grad_norm": 0.913244366645813, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 1840 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.908831000328064, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 1850 + }, + { + "epoch": 4.251428571428572, + "grad_norm": 1.0223685503005981, + "learning_rate": 0.0002, + "loss": 1.1653, + "step": 1860 + }, + { + "epoch": 4.274285714285714, + "grad_norm": 0.9771921634674072, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 1870 + }, + { + "epoch": 4.297142857142857, + "grad_norm": 0.9313384890556335, + "learning_rate": 0.0002, + "loss": 1.213, + "step": 1880 + }, + { + "epoch": 4.32, + "grad_norm": 1.0754257440567017, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 1890 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8904672265052795, + "learning_rate": 0.0002, + "loss": 1.2286, + "step": 1900 + }, + { + "epoch": 4.365714285714286, + "grad_norm": 1.046527624130249, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 1910 + }, + { + "epoch": 4.388571428571429, + "grad_norm": 0.9576982855796814, + "learning_rate": 0.0002, + "loss": 1.2368, + "step": 1920 + }, + { + "epoch": 4.411428571428571, + "grad_norm": 0.9278356432914734, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 1930 + }, + { + "epoch": 4.434285714285714, + "grad_norm": 1.1763030290603638, + "learning_rate": 0.0002, + "loss": 1.2005, + "step": 1940 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.9183000326156616, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 1950 + }, + { + "epoch": 4.48, + "grad_norm": 1.050980806350708, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 1960 + }, + { + "epoch": 4.502857142857143, + "grad_norm": 0.9975392818450928, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 1970 + }, + { + "epoch": 4.525714285714286, + "grad_norm": 0.990544319152832, + "learning_rate": 0.0002, + "loss": 1.2312, + "step": 1980 + }, + { + "epoch": 4.548571428571429, + "grad_norm": 1.004794955253601, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 1990 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.9294857978820801, + "learning_rate": 0.0002, + "loss": 1.2085, + "step": 2000 + }, + { + "epoch": 4.594285714285714, + "grad_norm": 0.93436598777771, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 2010 + }, + { + "epoch": 4.617142857142857, + "grad_norm": 0.8704655766487122, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 2020 + }, + { + "epoch": 4.64, + "grad_norm": 0.9077927470207214, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 2030 + }, + { + "epoch": 4.662857142857143, + "grad_norm": 0.912987470626831, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 2040 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.9740643501281738, + "learning_rate": 0.0002, + "loss": 1.2868, + "step": 2050 + }, + { + "epoch": 4.708571428571428, + "grad_norm": 1.133357048034668, + "learning_rate": 0.0002, + "loss": 1.249, + "step": 2060 + }, + { + "epoch": 4.731428571428571, + "grad_norm": 0.8844527006149292, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 2070 + }, + { + "epoch": 4.7542857142857144, + "grad_norm": 1.0083311796188354, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 2080 + }, + { + "epoch": 4.777142857142858, + "grad_norm": 1.000447154045105, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 2090 + }, + { + "epoch": 4.8, + "grad_norm": 0.9620300531387329, + "learning_rate": 0.0002, + "loss": 1.2313, + "step": 2100 + }, + { + "epoch": 4.822857142857143, + "grad_norm": 0.9843335151672363, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 2110 + }, + { + "epoch": 4.845714285714286, + "grad_norm": 0.9906681180000305, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 2120 + }, + { + "epoch": 4.868571428571428, + "grad_norm": 0.9544073939323425, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 2130 + }, + { + "epoch": 4.8914285714285715, + "grad_norm": 0.9392994046211243, + "learning_rate": 0.0002, + "loss": 1.284, + "step": 2140 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 1.104519248008728, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2150 + }, + { + "epoch": 4.937142857142857, + "grad_norm": 0.9495956897735596, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 2160 + }, + { + "epoch": 4.96, + "grad_norm": 0.9696287512779236, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 2170 + }, + { + "epoch": 4.982857142857143, + "grad_norm": 0.9933681488037109, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 2180 + }, + { + "epoch": 4.998857142857143, + "eval_loss": 2.099808692932129, + "eval_runtime": 111.2808, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 2187 + } + ], + "logging_steps": 10, + "max_steps": 3496, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.123581935616e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2187/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3be37d4c6b6f37771b509101bfb565c07e558f89 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b06a7478304c7210822cbebd82f6796081559cfcb7ad1b552b11e755a36f17f9 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb33931affd94b426653e6fa523865dee02cf4ab --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcd93dda94ea710dc659dc0fb3903f2c404cfb9cf8e62d7bc0408424394559fb +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..160d882adb883975ecb841d9f241c6d568194e9d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f50840728598e0c1ba5051a633704b00606fd9495de49a971236e0ce78e9f88 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ffd52db6f1ca6e0c11191fe35d976561cf706c4 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d26be192efafe0f0679b00ecf85ec3d157ecf6e17a0beab0bc64146c7a8f4c3 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6e7548b48eea941b6a227f36c36a6f00df62eac --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/trainer_state.json @@ -0,0 +1,1915 @@ +{ + "best_metric": 1.8279441595077515, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 2625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022857142857142857, + "grad_norm": 0.6273946762084961, + "learning_rate": 0.0002, + "loss": 3.066, + "step": 10 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.5300710201263428, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 20 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.6162196397781372, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 30 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.5143047571182251, + "learning_rate": 0.0002, + "loss": 2.1164, + "step": 40 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.4000673294067383, + "learning_rate": 0.0002, + "loss": 1.943, + "step": 50 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.444892555475235, + "learning_rate": 0.0002, + "loss": 1.9531, + "step": 60 + }, + { + "epoch": 0.16, + "grad_norm": 0.4871707558631897, + "learning_rate": 0.0002, + "loss": 1.9435, + "step": 70 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.451060026884079, + "learning_rate": 0.0002, + "loss": 1.9072, + "step": 80 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.3939569592475891, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 90 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.5033721923828125, + "learning_rate": 0.0002, + "loss": 1.8982, + "step": 100 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0002, + "loss": 1.9148, + "step": 110 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.4391206204891205, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 120 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.5243169665336609, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 0.4055655598640442, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 140 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.0002, + "loss": 1.8348, + "step": 150 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.4696349501609802, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 160 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.3987901508808136, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 170 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.32404327392578125, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 180 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.3692261576652527, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 190 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37267744541168213, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 200 + }, + { + "epoch": 0.48, + "grad_norm": 0.3559934198856354, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 210 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.3374815285205841, + "learning_rate": 0.0002, + "loss": 1.8651, + "step": 220 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.34598177671432495, + "learning_rate": 0.0002, + "loss": 1.8683, + "step": 230 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.35629919171333313, + "learning_rate": 0.0002, + "loss": 1.8554, + "step": 240 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3586862087249756, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 250 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.3198927342891693, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.37690025568008423, + "learning_rate": 0.0002, + "loss": 1.81, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.2855667471885681, + "learning_rate": 0.0002, + "loss": 1.8258, + "step": 280 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.3242695927619934, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 290 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2960120141506195, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 300 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.3596384823322296, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 310 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.3001834750175476, + "learning_rate": 0.0002, + "loss": 1.8132, + "step": 320 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.31361159682273865, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 330 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.3383876085281372, + "learning_rate": 0.0002, + "loss": 1.7674, + "step": 350 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.35100996494293213, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 360 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.344976007938385, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 370 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.3119729459285736, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 380 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.349221795797348, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 390 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3124293386936188, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 400 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.35504350066185, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 0.310310959815979, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 420 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.30432847142219543, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 430 + }, + { + "epoch": 0.9988571428571429, + "eval_loss": 1.8310153484344482, + "eval_runtime": 111.7814, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.573, + "step": 437 + }, + { + "epoch": 1.0057142857142858, + "grad_norm": 0.3121616840362549, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 440 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.3365118205547333, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 450 + }, + { + "epoch": 1.0514285714285714, + "grad_norm": 0.3626686930656433, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 460 + }, + { + "epoch": 1.0742857142857143, + "grad_norm": 0.30539533495903015, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 470 + }, + { + "epoch": 1.0971428571428572, + "grad_norm": 0.3159816861152649, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 480 + }, + { + "epoch": 1.12, + "grad_norm": 0.3695855736732483, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 490 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3609161674976349, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 500 + }, + { + "epoch": 1.1657142857142857, + "grad_norm": 0.3683869242668152, + "learning_rate": 0.0002, + "loss": 1.8723, + "step": 510 + }, + { + "epoch": 1.1885714285714286, + "grad_norm": 0.3862539529800415, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 520 + }, + { + "epoch": 1.2114285714285715, + "grad_norm": 0.4244740307331085, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 530 + }, + { + "epoch": 1.2342857142857142, + "grad_norm": 0.373703271150589, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 540 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.35715773701667786, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 550 + }, + { + "epoch": 1.28, + "grad_norm": 0.3555964231491089, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 560 + }, + { + "epoch": 1.302857142857143, + "grad_norm": 0.35080263018608093, + "learning_rate": 0.0002, + "loss": 1.7228, + "step": 570 + }, + { + "epoch": 1.3257142857142856, + "grad_norm": 0.3589482307434082, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 580 + }, + { + "epoch": 1.3485714285714285, + "grad_norm": 0.3711223900318146, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 590 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.313614159822464, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 600 + }, + { + "epoch": 1.3942857142857144, + "grad_norm": 0.3842357397079468, + "learning_rate": 0.0002, + "loss": 1.7191, + "step": 610 + }, + { + "epoch": 1.4171428571428573, + "grad_norm": 0.36126819252967834, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 620 + }, + { + "epoch": 1.44, + "grad_norm": 0.35922661423683167, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 630 + }, + { + "epoch": 1.4628571428571429, + "grad_norm": 0.3922875225543976, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 640 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.365546852350235, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 650 + }, + { + "epoch": 1.5085714285714285, + "grad_norm": 0.36107590794563293, + "learning_rate": 0.0002, + "loss": 1.674, + "step": 660 + }, + { + "epoch": 1.5314285714285716, + "grad_norm": 0.3307042121887207, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 670 + }, + { + "epoch": 1.5542857142857143, + "grad_norm": 0.3492133915424347, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 680 + }, + { + "epoch": 1.5771428571428572, + "grad_norm": 0.38608574867248535, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 690 + }, + { + "epoch": 1.6, + "grad_norm": 0.3489173650741577, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 700 + }, + { + "epoch": 1.6228571428571428, + "grad_norm": 0.36614152789115906, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 710 + }, + { + "epoch": 1.6457142857142857, + "grad_norm": 0.34340205788612366, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 720 + }, + { + "epoch": 1.6685714285714286, + "grad_norm": 0.34590771794319153, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 730 + }, + { + "epoch": 1.6914285714285713, + "grad_norm": 0.3759954273700714, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 740 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3753475546836853, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 750 + }, + { + "epoch": 1.737142857142857, + "grad_norm": 0.38416001200675964, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 760 + }, + { + "epoch": 1.76, + "grad_norm": 0.36223554611206055, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 770 + }, + { + "epoch": 1.782857142857143, + "grad_norm": 0.329556941986084, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 780 + }, + { + "epoch": 1.8057142857142856, + "grad_norm": 0.34008052945137024, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 790 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.40297919511795044, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 1.8514285714285714, + "grad_norm": 0.35378390550613403, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 810 + }, + { + "epoch": 1.8742857142857143, + "grad_norm": 0.3625478148460388, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 820 + }, + { + "epoch": 1.8971428571428572, + "grad_norm": 0.36153221130371094, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 830 + }, + { + "epoch": 1.92, + "grad_norm": 0.3612948954105377, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 840 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.399213045835495, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 850 + }, + { + "epoch": 1.9657142857142857, + "grad_norm": 0.40026402473449707, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 860 + }, + { + "epoch": 1.9885714285714284, + "grad_norm": 0.38114118576049805, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 870 + }, + { + "epoch": 2.0, + "eval_loss": 1.8279441595077515, + "eval_runtime": 111.1652, + "eval_samples_per_second": 4.561, + "eval_steps_per_second": 0.576, + "step": 875 + }, + { + "epoch": 2.0114285714285716, + "grad_norm": 0.33838793635368347, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 880 + }, + { + "epoch": 2.0342857142857143, + "grad_norm": 0.4219334125518799, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 890 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.43962377309799194, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 900 + }, + { + "epoch": 2.08, + "grad_norm": 0.41956576704978943, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 910 + }, + { + "epoch": 2.1028571428571428, + "grad_norm": 0.4439629912376404, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 920 + }, + { + "epoch": 2.125714285714286, + "grad_norm": 0.43405696749687195, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 930 + }, + { + "epoch": 2.1485714285714286, + "grad_norm": 0.4321737587451935, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 940 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.4689100682735443, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 950 + }, + { + "epoch": 2.1942857142857144, + "grad_norm": 0.47024697065353394, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 960 + }, + { + "epoch": 2.217142857142857, + "grad_norm": 0.4535103440284729, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 970 + }, + { + "epoch": 2.24, + "grad_norm": 0.45990121364593506, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 980 + }, + { + "epoch": 2.262857142857143, + "grad_norm": 0.48427215218544006, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 990 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.43076643347740173, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1000 + }, + { + "epoch": 2.3085714285714287, + "grad_norm": 0.4854483902454376, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 1010 + }, + { + "epoch": 2.3314285714285714, + "grad_norm": 0.46086496114730835, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1020 + }, + { + "epoch": 2.354285714285714, + "grad_norm": 0.4714847505092621, + "learning_rate": 0.0002, + "loss": 1.6392, + "step": 1030 + }, + { + "epoch": 2.3771428571428572, + "grad_norm": 0.4423409402370453, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 1040 + }, + { + "epoch": 2.4, + "grad_norm": 0.46261295676231384, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 1050 + }, + { + "epoch": 2.422857142857143, + "grad_norm": 0.4914337396621704, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 1060 + }, + { + "epoch": 2.4457142857142857, + "grad_norm": 0.45144036412239075, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 1070 + }, + { + "epoch": 2.4685714285714284, + "grad_norm": 0.4510825276374817, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 1080 + }, + { + "epoch": 2.4914285714285715, + "grad_norm": 0.48552489280700684, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 1090 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.4768163859844208, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 1100 + }, + { + "epoch": 2.5371428571428574, + "grad_norm": 0.5192609429359436, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 1110 + }, + { + "epoch": 2.56, + "grad_norm": 0.49308598041534424, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 1120 + }, + { + "epoch": 2.5828571428571427, + "grad_norm": 0.5068584084510803, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1130 + }, + { + "epoch": 2.605714285714286, + "grad_norm": 0.4822661280632019, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 1140 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 0.5028144717216492, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 1150 + }, + { + "epoch": 2.6514285714285712, + "grad_norm": 0.48315200209617615, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1160 + }, + { + "epoch": 2.6742857142857144, + "grad_norm": 0.551934540271759, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 1170 + }, + { + "epoch": 2.697142857142857, + "grad_norm": 0.49223729968070984, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 1180 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.514847457408905, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 1190 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.4830605387687683, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 1200 + }, + { + "epoch": 2.7657142857142856, + "grad_norm": 0.4584822952747345, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 1210 + }, + { + "epoch": 2.7885714285714287, + "grad_norm": 0.4688762426376343, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 1220 + }, + { + "epoch": 2.8114285714285714, + "grad_norm": 0.4488156735897064, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1230 + }, + { + "epoch": 2.8342857142857145, + "grad_norm": 0.4700278639793396, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 1240 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5282207131385803, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1250 + }, + { + "epoch": 2.88, + "grad_norm": 0.4874219000339508, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 1260 + }, + { + "epoch": 2.902857142857143, + "grad_norm": 0.49468332529067993, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 1270 + }, + { + "epoch": 2.9257142857142857, + "grad_norm": 0.49770233035087585, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 1280 + }, + { + "epoch": 2.9485714285714284, + "grad_norm": 0.4433252811431885, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 1290 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.46836379170417786, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 1300 + }, + { + "epoch": 2.994285714285714, + "grad_norm": 0.5001904368400574, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1310 + }, + { + "epoch": 2.998857142857143, + "eval_loss": 1.8684407472610474, + "eval_runtime": 111.2835, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1312 + }, + { + "epoch": 3.0171428571428573, + "grad_norm": 0.5184893012046814, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1320 + }, + { + "epoch": 3.04, + "grad_norm": 0.5665355920791626, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 1330 + }, + { + "epoch": 3.0628571428571427, + "grad_norm": 0.6601403951644897, + "learning_rate": 0.0002, + "loss": 1.3741, + "step": 1340 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 0.6921621561050415, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 1350 + }, + { + "epoch": 3.1085714285714285, + "grad_norm": 0.6406348943710327, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 1360 + }, + { + "epoch": 3.1314285714285712, + "grad_norm": 0.5814554691314697, + "learning_rate": 0.0002, + "loss": 1.3563, + "step": 1370 + }, + { + "epoch": 3.1542857142857144, + "grad_norm": 0.683325469493866, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 1380 + }, + { + "epoch": 3.177142857142857, + "grad_norm": 0.6686155200004578, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 1390 + }, + { + "epoch": 3.2, + "grad_norm": 0.8159713745117188, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 1400 + }, + { + "epoch": 3.222857142857143, + "grad_norm": 0.646216094493866, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 1410 + }, + { + "epoch": 3.2457142857142856, + "grad_norm": 0.7323529720306396, + "learning_rate": 0.0002, + "loss": 1.4232, + "step": 1420 + }, + { + "epoch": 3.2685714285714287, + "grad_norm": 0.689349353313446, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1430 + }, + { + "epoch": 3.2914285714285714, + "grad_norm": 0.727894127368927, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1440 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 0.6921590566635132, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 1450 + }, + { + "epoch": 3.337142857142857, + "grad_norm": 0.6176243424415588, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 1460 + }, + { + "epoch": 3.36, + "grad_norm": 0.9006354212760925, + "learning_rate": 0.0002, + "loss": 1.4323, + "step": 1470 + }, + { + "epoch": 3.382857142857143, + "grad_norm": 0.8145929574966431, + "learning_rate": 0.0002, + "loss": 1.4353, + "step": 1480 + }, + { + "epoch": 3.4057142857142857, + "grad_norm": 0.6640016436576843, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1490 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.7266780138015747, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 1500 + }, + { + "epoch": 3.4514285714285715, + "grad_norm": 0.9351356029510498, + "learning_rate": 0.0002, + "loss": 1.4108, + "step": 1510 + }, + { + "epoch": 3.474285714285714, + "grad_norm": 0.675645649433136, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 1520 + }, + { + "epoch": 3.4971428571428573, + "grad_norm": 0.761472225189209, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 1530 + }, + { + "epoch": 3.52, + "grad_norm": 0.6653069257736206, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1540 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.667412519454956, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 1550 + }, + { + "epoch": 3.565714285714286, + "grad_norm": 0.6395593881607056, + "learning_rate": 0.0002, + "loss": 1.4241, + "step": 1560 + }, + { + "epoch": 3.5885714285714285, + "grad_norm": 0.7588621377944946, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1570 + }, + { + "epoch": 3.611428571428571, + "grad_norm": 0.6206456422805786, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 1580 + }, + { + "epoch": 3.6342857142857143, + "grad_norm": 0.7591291666030884, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 1590 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 0.6476313471794128, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 1600 + }, + { + "epoch": 3.68, + "grad_norm": 0.6731392741203308, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 1610 + }, + { + "epoch": 3.702857142857143, + "grad_norm": 0.725190281867981, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 1620 + }, + { + "epoch": 3.725714285714286, + "grad_norm": 0.6720049977302551, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 1630 + }, + { + "epoch": 3.7485714285714287, + "grad_norm": 0.6301007270812988, + "learning_rate": 0.0002, + "loss": 1.429, + "step": 1640 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.715893566608429, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 1650 + }, + { + "epoch": 3.7942857142857145, + "grad_norm": 0.7539359927177429, + "learning_rate": 0.0002, + "loss": 1.3624, + "step": 1660 + }, + { + "epoch": 3.817142857142857, + "grad_norm": 0.6658543348312378, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 1670 + }, + { + "epoch": 3.84, + "grad_norm": 0.7019526958465576, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 1680 + }, + { + "epoch": 3.862857142857143, + "grad_norm": 0.6517802476882935, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 1690 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.7617332935333252, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1700 + }, + { + "epoch": 3.9085714285714284, + "grad_norm": 0.6919480562210083, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 1710 + }, + { + "epoch": 3.9314285714285715, + "grad_norm": 0.6987943053245544, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 1720 + }, + { + "epoch": 3.954285714285714, + "grad_norm": 0.7062228918075562, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 1730 + }, + { + "epoch": 3.977142857142857, + "grad_norm": 0.6769542098045349, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 1740 + }, + { + "epoch": 4.0, + "grad_norm": 0.6832144260406494, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 1750 + }, + { + "epoch": 4.0, + "eval_loss": 1.9474865198135376, + "eval_runtime": 111.288, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1750 + }, + { + "epoch": 4.022857142857143, + "grad_norm": 1.064110279083252, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 1760 + }, + { + "epoch": 4.045714285714285, + "grad_norm": 0.8380683660507202, + "learning_rate": 0.0002, + "loss": 1.2013, + "step": 1770 + }, + { + "epoch": 4.0685714285714285, + "grad_norm": 1.1863020658493042, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 1780 + }, + { + "epoch": 4.091428571428572, + "grad_norm": 1.0128898620605469, + "learning_rate": 0.0002, + "loss": 1.2499, + "step": 1790 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 0.9221312403678894, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 1800 + }, + { + "epoch": 4.137142857142857, + "grad_norm": 1.1298727989196777, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 1810 + }, + { + "epoch": 4.16, + "grad_norm": 0.8854547739028931, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 1820 + }, + { + "epoch": 4.182857142857143, + "grad_norm": 0.8920808434486389, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 1830 + }, + { + "epoch": 4.2057142857142855, + "grad_norm": 0.913244366645813, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 1840 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.908831000328064, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 1850 + }, + { + "epoch": 4.251428571428572, + "grad_norm": 1.0223685503005981, + "learning_rate": 0.0002, + "loss": 1.1653, + "step": 1860 + }, + { + "epoch": 4.274285714285714, + "grad_norm": 0.9771921634674072, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 1870 + }, + { + "epoch": 4.297142857142857, + "grad_norm": 0.9313384890556335, + "learning_rate": 0.0002, + "loss": 1.213, + "step": 1880 + }, + { + "epoch": 4.32, + "grad_norm": 1.0754257440567017, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 1890 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8904672265052795, + "learning_rate": 0.0002, + "loss": 1.2286, + "step": 1900 + }, + { + "epoch": 4.365714285714286, + "grad_norm": 1.046527624130249, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 1910 + }, + { + "epoch": 4.388571428571429, + "grad_norm": 0.9576982855796814, + "learning_rate": 0.0002, + "loss": 1.2368, + "step": 1920 + }, + { + "epoch": 4.411428571428571, + "grad_norm": 0.9278356432914734, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 1930 + }, + { + "epoch": 4.434285714285714, + "grad_norm": 1.1763030290603638, + "learning_rate": 0.0002, + "loss": 1.2005, + "step": 1940 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.9183000326156616, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 1950 + }, + { + "epoch": 4.48, + "grad_norm": 1.050980806350708, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 1960 + }, + { + "epoch": 4.502857142857143, + "grad_norm": 0.9975392818450928, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 1970 + }, + { + "epoch": 4.525714285714286, + "grad_norm": 0.990544319152832, + "learning_rate": 0.0002, + "loss": 1.2312, + "step": 1980 + }, + { + "epoch": 4.548571428571429, + "grad_norm": 1.004794955253601, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 1990 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.9294857978820801, + "learning_rate": 0.0002, + "loss": 1.2085, + "step": 2000 + }, + { + "epoch": 4.594285714285714, + "grad_norm": 0.93436598777771, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 2010 + }, + { + "epoch": 4.617142857142857, + "grad_norm": 0.8704655766487122, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 2020 + }, + { + "epoch": 4.64, + "grad_norm": 0.9077927470207214, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 2030 + }, + { + "epoch": 4.662857142857143, + "grad_norm": 0.912987470626831, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 2040 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.9740643501281738, + "learning_rate": 0.0002, + "loss": 1.2868, + "step": 2050 + }, + { + "epoch": 4.708571428571428, + "grad_norm": 1.133357048034668, + "learning_rate": 0.0002, + "loss": 1.249, + "step": 2060 + }, + { + "epoch": 4.731428571428571, + "grad_norm": 0.8844527006149292, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 2070 + }, + { + "epoch": 4.7542857142857144, + "grad_norm": 1.0083311796188354, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 2080 + }, + { + "epoch": 4.777142857142858, + "grad_norm": 1.000447154045105, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 2090 + }, + { + "epoch": 4.8, + "grad_norm": 0.9620300531387329, + "learning_rate": 0.0002, + "loss": 1.2313, + "step": 2100 + }, + { + "epoch": 4.822857142857143, + "grad_norm": 0.9843335151672363, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 2110 + }, + { + "epoch": 4.845714285714286, + "grad_norm": 0.9906681180000305, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 2120 + }, + { + "epoch": 4.868571428571428, + "grad_norm": 0.9544073939323425, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 2130 + }, + { + "epoch": 4.8914285714285715, + "grad_norm": 0.9392994046211243, + "learning_rate": 0.0002, + "loss": 1.284, + "step": 2140 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 1.104519248008728, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2150 + }, + { + "epoch": 4.937142857142857, + "grad_norm": 0.9495956897735596, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 2160 + }, + { + "epoch": 4.96, + "grad_norm": 0.9696287512779236, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 2170 + }, + { + "epoch": 4.982857142857143, + "grad_norm": 0.9933681488037109, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 2180 + }, + { + "epoch": 4.998857142857143, + "eval_loss": 2.099808692932129, + "eval_runtime": 111.2808, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 2187 + }, + { + "epoch": 5.005714285714285, + "grad_norm": 0.9482853412628174, + "learning_rate": 0.0002, + "loss": 1.2183, + "step": 2190 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 1.6689555644989014, + "learning_rate": 0.0002, + "loss": 0.9898, + "step": 2200 + }, + { + "epoch": 5.051428571428572, + "grad_norm": 1.2019699811935425, + "learning_rate": 0.0002, + "loss": 0.9741, + "step": 2210 + }, + { + "epoch": 5.074285714285715, + "grad_norm": 1.535780429840088, + "learning_rate": 0.0002, + "loss": 0.9737, + "step": 2220 + }, + { + "epoch": 5.097142857142857, + "grad_norm": 1.2061309814453125, + "learning_rate": 0.0002, + "loss": 0.9494, + "step": 2230 + }, + { + "epoch": 5.12, + "grad_norm": 1.1898778676986694, + "learning_rate": 0.0002, + "loss": 0.9316, + "step": 2240 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 1.158898949623108, + "learning_rate": 0.0002, + "loss": 1.002, + "step": 2250 + }, + { + "epoch": 5.1657142857142855, + "grad_norm": 1.370749592781067, + "learning_rate": 0.0002, + "loss": 0.9715, + "step": 2260 + }, + { + "epoch": 5.188571428571429, + "grad_norm": 1.314120888710022, + "learning_rate": 0.0002, + "loss": 0.9365, + "step": 2270 + }, + { + "epoch": 5.211428571428572, + "grad_norm": 1.2184966802597046, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 2280 + }, + { + "epoch": 5.234285714285714, + "grad_norm": 1.4833279848098755, + "learning_rate": 0.0002, + "loss": 0.9407, + "step": 2290 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 1.3348219394683838, + "learning_rate": 0.0002, + "loss": 0.9635, + "step": 2300 + }, + { + "epoch": 5.28, + "grad_norm": 1.4166619777679443, + "learning_rate": 0.0002, + "loss": 1.0294, + "step": 2310 + }, + { + "epoch": 5.3028571428571425, + "grad_norm": 1.4539530277252197, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 2320 + }, + { + "epoch": 5.325714285714286, + "grad_norm": 1.4642518758773804, + "learning_rate": 0.0002, + "loss": 1.0165, + "step": 2330 + }, + { + "epoch": 5.348571428571429, + "grad_norm": 1.3938848972320557, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 2340 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 1.1147894859313965, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 2350 + }, + { + "epoch": 5.394285714285714, + "grad_norm": 1.3465309143066406, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2360 + }, + { + "epoch": 5.417142857142857, + "grad_norm": 1.4788566827774048, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 2370 + }, + { + "epoch": 5.44, + "grad_norm": 1.3808705806732178, + "learning_rate": 0.0002, + "loss": 0.9896, + "step": 2380 + }, + { + "epoch": 5.462857142857143, + "grad_norm": 1.2336329221725464, + "learning_rate": 0.0002, + "loss": 1.0279, + "step": 2390 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 1.5445678234100342, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 2400 + }, + { + "epoch": 5.508571428571429, + "grad_norm": 1.107488989830017, + "learning_rate": 0.0002, + "loss": 0.9534, + "step": 2410 + }, + { + "epoch": 5.531428571428571, + "grad_norm": 1.39687979221344, + "learning_rate": 0.0002, + "loss": 1.0036, + "step": 2420 + }, + { + "epoch": 5.554285714285714, + "grad_norm": 1.3905695676803589, + "learning_rate": 0.0002, + "loss": 0.9959, + "step": 2430 + }, + { + "epoch": 5.577142857142857, + "grad_norm": 1.3772821426391602, + "learning_rate": 0.0002, + "loss": 0.9912, + "step": 2440 + }, + { + "epoch": 5.6, + "grad_norm": 1.1661899089813232, + "learning_rate": 0.0002, + "loss": 0.9825, + "step": 2450 + }, + { + "epoch": 5.622857142857143, + "grad_norm": 1.2730463743209839, + "learning_rate": 0.0002, + "loss": 1.0003, + "step": 2460 + }, + { + "epoch": 5.645714285714286, + "grad_norm": 1.2251193523406982, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2470 + }, + { + "epoch": 5.668571428571429, + "grad_norm": 1.5454859733581543, + "learning_rate": 0.0002, + "loss": 1.079, + "step": 2480 + }, + { + "epoch": 5.691428571428571, + "grad_norm": 1.5405735969543457, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 2490 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.2555434703826904, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 2500 + }, + { + "epoch": 5.737142857142857, + "grad_norm": 1.3323487043380737, + "learning_rate": 0.0002, + "loss": 1.0019, + "step": 2510 + }, + { + "epoch": 5.76, + "grad_norm": 1.3106356859207153, + "learning_rate": 0.0002, + "loss": 1.051, + "step": 2520 + }, + { + "epoch": 5.782857142857143, + "grad_norm": 1.4832439422607422, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 2530 + }, + { + "epoch": 5.805714285714286, + "grad_norm": 1.1336562633514404, + "learning_rate": 0.0002, + "loss": 1.0643, + "step": 2540 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 1.2434223890304565, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2550 + }, + { + "epoch": 5.851428571428571, + "grad_norm": 1.2825450897216797, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 2560 + }, + { + "epoch": 5.8742857142857146, + "grad_norm": 1.4373180866241455, + "learning_rate": 0.0002, + "loss": 1.0642, + "step": 2570 + }, + { + "epoch": 5.897142857142857, + "grad_norm": 1.435015320777893, + "learning_rate": 0.0002, + "loss": 1.0814, + "step": 2580 + }, + { + "epoch": 5.92, + "grad_norm": 1.4075653553009033, + "learning_rate": 0.0002, + "loss": 1.0272, + "step": 2590 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 1.319630742073059, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 2600 + }, + { + "epoch": 5.965714285714286, + "grad_norm": 1.278330683708191, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 2610 + }, + { + "epoch": 5.988571428571428, + "grad_norm": 1.258158564567566, + "learning_rate": 0.0002, + "loss": 1.0766, + "step": 2620 + }, + { + "epoch": 6.0, + "eval_loss": 2.3689301013946533, + "eval_runtime": 53.9067, + "eval_samples_per_second": 9.405, + "eval_steps_per_second": 1.187, + "step": 2625 + } + ], + "logging_steps": 10, + "max_steps": 3496, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3482983227392e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-2625/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b5957b8c1cf78a037cd4cb28a91e479b2772d34d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a63a66efe51eade1b9db3a782d99bc39a1e84281614b84989b4348e1f79c779d +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a52a3098e0998ce805cd093bfcf013db26916032 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966121a4ac71b5d4daac8d4447757f2a8760e183f3705e4ad2e32751089035a5 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..05d9d819b10f6fe669fd791f284b855e1b7e643f --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d8d725749217e42693110f9b71f15a25465f765c9d4eaa627a8624221589e2 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2036ac3aefac234519f52c8600b40a6386706df9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5da6eb53a5412fee93151405c8188c4782e3828b634238ebc610dcc0daafcddd +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1c7080667b59b789e5c80584eb7744ab50cd5aa4 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/trainer_state.json @@ -0,0 +1,2231 @@ +{ + "best_metric": 1.8279441595077515, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", + "epoch": 6.998857142857143, + "eval_steps": 10, + "global_step": 3062, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022857142857142857, + "grad_norm": 0.6273946762084961, + "learning_rate": 0.0002, + "loss": 3.066, + "step": 10 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.5300710201263428, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 20 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.6162196397781372, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 30 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.5143047571182251, + "learning_rate": 0.0002, + "loss": 2.1164, + "step": 40 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.4000673294067383, + "learning_rate": 0.0002, + "loss": 1.943, + "step": 50 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.444892555475235, + "learning_rate": 0.0002, + "loss": 1.9531, + "step": 60 + }, + { + "epoch": 0.16, + "grad_norm": 0.4871707558631897, + "learning_rate": 0.0002, + "loss": 1.9435, + "step": 70 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.451060026884079, + "learning_rate": 0.0002, + "loss": 1.9072, + "step": 80 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.3939569592475891, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 90 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.5033721923828125, + "learning_rate": 0.0002, + "loss": 1.8982, + "step": 100 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0002, + "loss": 1.9148, + "step": 110 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.4391206204891205, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 120 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.5243169665336609, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 0.4055655598640442, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 140 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.0002, + "loss": 1.8348, + "step": 150 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.4696349501609802, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 160 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.3987901508808136, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 170 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.32404327392578125, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 180 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.3692261576652527, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 190 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37267744541168213, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 200 + }, + { + "epoch": 0.48, + "grad_norm": 0.3559934198856354, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 210 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.3374815285205841, + "learning_rate": 0.0002, + "loss": 1.8651, + "step": 220 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.34598177671432495, + "learning_rate": 0.0002, + "loss": 1.8683, + "step": 230 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.35629919171333313, + "learning_rate": 0.0002, + "loss": 1.8554, + "step": 240 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3586862087249756, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 250 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.3198927342891693, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.37690025568008423, + "learning_rate": 0.0002, + "loss": 1.81, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.2855667471885681, + "learning_rate": 0.0002, + "loss": 1.8258, + "step": 280 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.3242695927619934, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 290 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2960120141506195, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 300 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.3596384823322296, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 310 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.3001834750175476, + "learning_rate": 0.0002, + "loss": 1.8132, + "step": 320 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.31361159682273865, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 330 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.3383876085281372, + "learning_rate": 0.0002, + "loss": 1.7674, + "step": 350 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.35100996494293213, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 360 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.344976007938385, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 370 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.3119729459285736, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 380 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.349221795797348, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 390 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3124293386936188, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 400 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.35504350066185, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 0.310310959815979, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 420 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.30432847142219543, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 430 + }, + { + "epoch": 0.9988571428571429, + "eval_loss": 1.8310153484344482, + "eval_runtime": 111.7814, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.573, + "step": 437 + }, + { + "epoch": 1.0057142857142858, + "grad_norm": 0.3121616840362549, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 440 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.3365118205547333, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 450 + }, + { + "epoch": 1.0514285714285714, + "grad_norm": 0.3626686930656433, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 460 + }, + { + "epoch": 1.0742857142857143, + "grad_norm": 0.30539533495903015, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 470 + }, + { + "epoch": 1.0971428571428572, + "grad_norm": 0.3159816861152649, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 480 + }, + { + "epoch": 1.12, + "grad_norm": 0.3695855736732483, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 490 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3609161674976349, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 500 + }, + { + "epoch": 1.1657142857142857, + "grad_norm": 0.3683869242668152, + "learning_rate": 0.0002, + "loss": 1.8723, + "step": 510 + }, + { + "epoch": 1.1885714285714286, + "grad_norm": 0.3862539529800415, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 520 + }, + { + "epoch": 1.2114285714285715, + "grad_norm": 0.4244740307331085, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 530 + }, + { + "epoch": 1.2342857142857142, + "grad_norm": 0.373703271150589, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 540 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.35715773701667786, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 550 + }, + { + "epoch": 1.28, + "grad_norm": 0.3555964231491089, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 560 + }, + { + "epoch": 1.302857142857143, + "grad_norm": 0.35080263018608093, + "learning_rate": 0.0002, + "loss": 1.7228, + "step": 570 + }, + { + "epoch": 1.3257142857142856, + "grad_norm": 0.3589482307434082, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 580 + }, + { + "epoch": 1.3485714285714285, + "grad_norm": 0.3711223900318146, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 590 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.313614159822464, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 600 + }, + { + "epoch": 1.3942857142857144, + "grad_norm": 0.3842357397079468, + "learning_rate": 0.0002, + "loss": 1.7191, + "step": 610 + }, + { + "epoch": 1.4171428571428573, + "grad_norm": 0.36126819252967834, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 620 + }, + { + "epoch": 1.44, + "grad_norm": 0.35922661423683167, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 630 + }, + { + "epoch": 1.4628571428571429, + "grad_norm": 0.3922875225543976, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 640 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.365546852350235, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 650 + }, + { + "epoch": 1.5085714285714285, + "grad_norm": 0.36107590794563293, + "learning_rate": 0.0002, + "loss": 1.674, + "step": 660 + }, + { + "epoch": 1.5314285714285716, + "grad_norm": 0.3307042121887207, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 670 + }, + { + "epoch": 1.5542857142857143, + "grad_norm": 0.3492133915424347, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 680 + }, + { + "epoch": 1.5771428571428572, + "grad_norm": 0.38608574867248535, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 690 + }, + { + "epoch": 1.6, + "grad_norm": 0.3489173650741577, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 700 + }, + { + "epoch": 1.6228571428571428, + "grad_norm": 0.36614152789115906, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 710 + }, + { + "epoch": 1.6457142857142857, + "grad_norm": 0.34340205788612366, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 720 + }, + { + "epoch": 1.6685714285714286, + "grad_norm": 0.34590771794319153, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 730 + }, + { + "epoch": 1.6914285714285713, + "grad_norm": 0.3759954273700714, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 740 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3753475546836853, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 750 + }, + { + "epoch": 1.737142857142857, + "grad_norm": 0.38416001200675964, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 760 + }, + { + "epoch": 1.76, + "grad_norm": 0.36223554611206055, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 770 + }, + { + "epoch": 1.782857142857143, + "grad_norm": 0.329556941986084, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 780 + }, + { + "epoch": 1.8057142857142856, + "grad_norm": 0.34008052945137024, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 790 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.40297919511795044, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 1.8514285714285714, + "grad_norm": 0.35378390550613403, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 810 + }, + { + "epoch": 1.8742857142857143, + "grad_norm": 0.3625478148460388, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 820 + }, + { + "epoch": 1.8971428571428572, + "grad_norm": 0.36153221130371094, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 830 + }, + { + "epoch": 1.92, + "grad_norm": 0.3612948954105377, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 840 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.399213045835495, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 850 + }, + { + "epoch": 1.9657142857142857, + "grad_norm": 0.40026402473449707, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 860 + }, + { + "epoch": 1.9885714285714284, + "grad_norm": 0.38114118576049805, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 870 + }, + { + "epoch": 2.0, + "eval_loss": 1.8279441595077515, + "eval_runtime": 111.1652, + "eval_samples_per_second": 4.561, + "eval_steps_per_second": 0.576, + "step": 875 + }, + { + "epoch": 2.0114285714285716, + "grad_norm": 0.33838793635368347, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 880 + }, + { + "epoch": 2.0342857142857143, + "grad_norm": 0.4219334125518799, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 890 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.43962377309799194, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 900 + }, + { + "epoch": 2.08, + "grad_norm": 0.41956576704978943, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 910 + }, + { + "epoch": 2.1028571428571428, + "grad_norm": 0.4439629912376404, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 920 + }, + { + "epoch": 2.125714285714286, + "grad_norm": 0.43405696749687195, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 930 + }, + { + "epoch": 2.1485714285714286, + "grad_norm": 0.4321737587451935, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 940 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.4689100682735443, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 950 + }, + { + "epoch": 2.1942857142857144, + "grad_norm": 0.47024697065353394, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 960 + }, + { + "epoch": 2.217142857142857, + "grad_norm": 0.4535103440284729, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 970 + }, + { + "epoch": 2.24, + "grad_norm": 0.45990121364593506, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 980 + }, + { + "epoch": 2.262857142857143, + "grad_norm": 0.48427215218544006, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 990 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.43076643347740173, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1000 + }, + { + "epoch": 2.3085714285714287, + "grad_norm": 0.4854483902454376, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 1010 + }, + { + "epoch": 2.3314285714285714, + "grad_norm": 0.46086496114730835, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1020 + }, + { + "epoch": 2.354285714285714, + "grad_norm": 0.4714847505092621, + "learning_rate": 0.0002, + "loss": 1.6392, + "step": 1030 + }, + { + "epoch": 2.3771428571428572, + "grad_norm": 0.4423409402370453, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 1040 + }, + { + "epoch": 2.4, + "grad_norm": 0.46261295676231384, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 1050 + }, + { + "epoch": 2.422857142857143, + "grad_norm": 0.4914337396621704, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 1060 + }, + { + "epoch": 2.4457142857142857, + "grad_norm": 0.45144036412239075, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 1070 + }, + { + "epoch": 2.4685714285714284, + "grad_norm": 0.4510825276374817, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 1080 + }, + { + "epoch": 2.4914285714285715, + "grad_norm": 0.48552489280700684, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 1090 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.4768163859844208, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 1100 + }, + { + "epoch": 2.5371428571428574, + "grad_norm": 0.5192609429359436, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 1110 + }, + { + "epoch": 2.56, + "grad_norm": 0.49308598041534424, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 1120 + }, + { + "epoch": 2.5828571428571427, + "grad_norm": 0.5068584084510803, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1130 + }, + { + "epoch": 2.605714285714286, + "grad_norm": 0.4822661280632019, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 1140 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 0.5028144717216492, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 1150 + }, + { + "epoch": 2.6514285714285712, + "grad_norm": 0.48315200209617615, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1160 + }, + { + "epoch": 2.6742857142857144, + "grad_norm": 0.551934540271759, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 1170 + }, + { + "epoch": 2.697142857142857, + "grad_norm": 0.49223729968070984, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 1180 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.514847457408905, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 1190 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.4830605387687683, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 1200 + }, + { + "epoch": 2.7657142857142856, + "grad_norm": 0.4584822952747345, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 1210 + }, + { + "epoch": 2.7885714285714287, + "grad_norm": 0.4688762426376343, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 1220 + }, + { + "epoch": 2.8114285714285714, + "grad_norm": 0.4488156735897064, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1230 + }, + { + "epoch": 2.8342857142857145, + "grad_norm": 0.4700278639793396, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 1240 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5282207131385803, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1250 + }, + { + "epoch": 2.88, + "grad_norm": 0.4874219000339508, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 1260 + }, + { + "epoch": 2.902857142857143, + "grad_norm": 0.49468332529067993, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 1270 + }, + { + "epoch": 2.9257142857142857, + "grad_norm": 0.49770233035087585, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 1280 + }, + { + "epoch": 2.9485714285714284, + "grad_norm": 0.4433252811431885, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 1290 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.46836379170417786, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 1300 + }, + { + "epoch": 2.994285714285714, + "grad_norm": 0.5001904368400574, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1310 + }, + { + "epoch": 2.998857142857143, + "eval_loss": 1.8684407472610474, + "eval_runtime": 111.2835, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1312 + }, + { + "epoch": 3.0171428571428573, + "grad_norm": 0.5184893012046814, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1320 + }, + { + "epoch": 3.04, + "grad_norm": 0.5665355920791626, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 1330 + }, + { + "epoch": 3.0628571428571427, + "grad_norm": 0.6601403951644897, + "learning_rate": 0.0002, + "loss": 1.3741, + "step": 1340 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 0.6921621561050415, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 1350 + }, + { + "epoch": 3.1085714285714285, + "grad_norm": 0.6406348943710327, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 1360 + }, + { + "epoch": 3.1314285714285712, + "grad_norm": 0.5814554691314697, + "learning_rate": 0.0002, + "loss": 1.3563, + "step": 1370 + }, + { + "epoch": 3.1542857142857144, + "grad_norm": 0.683325469493866, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 1380 + }, + { + "epoch": 3.177142857142857, + "grad_norm": 0.6686155200004578, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 1390 + }, + { + "epoch": 3.2, + "grad_norm": 0.8159713745117188, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 1400 + }, + { + "epoch": 3.222857142857143, + "grad_norm": 0.646216094493866, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 1410 + }, + { + "epoch": 3.2457142857142856, + "grad_norm": 0.7323529720306396, + "learning_rate": 0.0002, + "loss": 1.4232, + "step": 1420 + }, + { + "epoch": 3.2685714285714287, + "grad_norm": 0.689349353313446, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1430 + }, + { + "epoch": 3.2914285714285714, + "grad_norm": 0.727894127368927, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1440 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 0.6921590566635132, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 1450 + }, + { + "epoch": 3.337142857142857, + "grad_norm": 0.6176243424415588, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 1460 + }, + { + "epoch": 3.36, + "grad_norm": 0.9006354212760925, + "learning_rate": 0.0002, + "loss": 1.4323, + "step": 1470 + }, + { + "epoch": 3.382857142857143, + "grad_norm": 0.8145929574966431, + "learning_rate": 0.0002, + "loss": 1.4353, + "step": 1480 + }, + { + "epoch": 3.4057142857142857, + "grad_norm": 0.6640016436576843, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1490 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.7266780138015747, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 1500 + }, + { + "epoch": 3.4514285714285715, + "grad_norm": 0.9351356029510498, + "learning_rate": 0.0002, + "loss": 1.4108, + "step": 1510 + }, + { + "epoch": 3.474285714285714, + "grad_norm": 0.675645649433136, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 1520 + }, + { + "epoch": 3.4971428571428573, + "grad_norm": 0.761472225189209, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 1530 + }, + { + "epoch": 3.52, + "grad_norm": 0.6653069257736206, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1540 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.667412519454956, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 1550 + }, + { + "epoch": 3.565714285714286, + "grad_norm": 0.6395593881607056, + "learning_rate": 0.0002, + "loss": 1.4241, + "step": 1560 + }, + { + "epoch": 3.5885714285714285, + "grad_norm": 0.7588621377944946, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1570 + }, + { + "epoch": 3.611428571428571, + "grad_norm": 0.6206456422805786, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 1580 + }, + { + "epoch": 3.6342857142857143, + "grad_norm": 0.7591291666030884, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 1590 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 0.6476313471794128, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 1600 + }, + { + "epoch": 3.68, + "grad_norm": 0.6731392741203308, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 1610 + }, + { + "epoch": 3.702857142857143, + "grad_norm": 0.725190281867981, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 1620 + }, + { + "epoch": 3.725714285714286, + "grad_norm": 0.6720049977302551, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 1630 + }, + { + "epoch": 3.7485714285714287, + "grad_norm": 0.6301007270812988, + "learning_rate": 0.0002, + "loss": 1.429, + "step": 1640 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.715893566608429, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 1650 + }, + { + "epoch": 3.7942857142857145, + "grad_norm": 0.7539359927177429, + "learning_rate": 0.0002, + "loss": 1.3624, + "step": 1660 + }, + { + "epoch": 3.817142857142857, + "grad_norm": 0.6658543348312378, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 1670 + }, + { + "epoch": 3.84, + "grad_norm": 0.7019526958465576, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 1680 + }, + { + "epoch": 3.862857142857143, + "grad_norm": 0.6517802476882935, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 1690 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.7617332935333252, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1700 + }, + { + "epoch": 3.9085714285714284, + "grad_norm": 0.6919480562210083, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 1710 + }, + { + "epoch": 3.9314285714285715, + "grad_norm": 0.6987943053245544, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 1720 + }, + { + "epoch": 3.954285714285714, + "grad_norm": 0.7062228918075562, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 1730 + }, + { + "epoch": 3.977142857142857, + "grad_norm": 0.6769542098045349, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 1740 + }, + { + "epoch": 4.0, + "grad_norm": 0.6832144260406494, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 1750 + }, + { + "epoch": 4.0, + "eval_loss": 1.9474865198135376, + "eval_runtime": 111.288, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1750 + }, + { + "epoch": 4.022857142857143, + "grad_norm": 1.064110279083252, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 1760 + }, + { + "epoch": 4.045714285714285, + "grad_norm": 0.8380683660507202, + "learning_rate": 0.0002, + "loss": 1.2013, + "step": 1770 + }, + { + "epoch": 4.0685714285714285, + "grad_norm": 1.1863020658493042, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 1780 + }, + { + "epoch": 4.091428571428572, + "grad_norm": 1.0128898620605469, + "learning_rate": 0.0002, + "loss": 1.2499, + "step": 1790 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 0.9221312403678894, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 1800 + }, + { + "epoch": 4.137142857142857, + "grad_norm": 1.1298727989196777, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 1810 + }, + { + "epoch": 4.16, + "grad_norm": 0.8854547739028931, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 1820 + }, + { + "epoch": 4.182857142857143, + "grad_norm": 0.8920808434486389, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 1830 + }, + { + "epoch": 4.2057142857142855, + "grad_norm": 0.913244366645813, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 1840 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.908831000328064, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 1850 + }, + { + "epoch": 4.251428571428572, + "grad_norm": 1.0223685503005981, + "learning_rate": 0.0002, + "loss": 1.1653, + "step": 1860 + }, + { + "epoch": 4.274285714285714, + "grad_norm": 0.9771921634674072, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 1870 + }, + { + "epoch": 4.297142857142857, + "grad_norm": 0.9313384890556335, + "learning_rate": 0.0002, + "loss": 1.213, + "step": 1880 + }, + { + "epoch": 4.32, + "grad_norm": 1.0754257440567017, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 1890 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8904672265052795, + "learning_rate": 0.0002, + "loss": 1.2286, + "step": 1900 + }, + { + "epoch": 4.365714285714286, + "grad_norm": 1.046527624130249, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 1910 + }, + { + "epoch": 4.388571428571429, + "grad_norm": 0.9576982855796814, + "learning_rate": 0.0002, + "loss": 1.2368, + "step": 1920 + }, + { + "epoch": 4.411428571428571, + "grad_norm": 0.9278356432914734, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 1930 + }, + { + "epoch": 4.434285714285714, + "grad_norm": 1.1763030290603638, + "learning_rate": 0.0002, + "loss": 1.2005, + "step": 1940 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.9183000326156616, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 1950 + }, + { + "epoch": 4.48, + "grad_norm": 1.050980806350708, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 1960 + }, + { + "epoch": 4.502857142857143, + "grad_norm": 0.9975392818450928, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 1970 + }, + { + "epoch": 4.525714285714286, + "grad_norm": 0.990544319152832, + "learning_rate": 0.0002, + "loss": 1.2312, + "step": 1980 + }, + { + "epoch": 4.548571428571429, + "grad_norm": 1.004794955253601, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 1990 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.9294857978820801, + "learning_rate": 0.0002, + "loss": 1.2085, + "step": 2000 + }, + { + "epoch": 4.594285714285714, + "grad_norm": 0.93436598777771, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 2010 + }, + { + "epoch": 4.617142857142857, + "grad_norm": 0.8704655766487122, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 2020 + }, + { + "epoch": 4.64, + "grad_norm": 0.9077927470207214, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 2030 + }, + { + "epoch": 4.662857142857143, + "grad_norm": 0.912987470626831, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 2040 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.9740643501281738, + "learning_rate": 0.0002, + "loss": 1.2868, + "step": 2050 + }, + { + "epoch": 4.708571428571428, + "grad_norm": 1.133357048034668, + "learning_rate": 0.0002, + "loss": 1.249, + "step": 2060 + }, + { + "epoch": 4.731428571428571, + "grad_norm": 0.8844527006149292, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 2070 + }, + { + "epoch": 4.7542857142857144, + "grad_norm": 1.0083311796188354, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 2080 + }, + { + "epoch": 4.777142857142858, + "grad_norm": 1.000447154045105, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 2090 + }, + { + "epoch": 4.8, + "grad_norm": 0.9620300531387329, + "learning_rate": 0.0002, + "loss": 1.2313, + "step": 2100 + }, + { + "epoch": 4.822857142857143, + "grad_norm": 0.9843335151672363, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 2110 + }, + { + "epoch": 4.845714285714286, + "grad_norm": 0.9906681180000305, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 2120 + }, + { + "epoch": 4.868571428571428, + "grad_norm": 0.9544073939323425, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 2130 + }, + { + "epoch": 4.8914285714285715, + "grad_norm": 0.9392994046211243, + "learning_rate": 0.0002, + "loss": 1.284, + "step": 2140 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 1.104519248008728, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2150 + }, + { + "epoch": 4.937142857142857, + "grad_norm": 0.9495956897735596, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 2160 + }, + { + "epoch": 4.96, + "grad_norm": 0.9696287512779236, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 2170 + }, + { + "epoch": 4.982857142857143, + "grad_norm": 0.9933681488037109, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 2180 + }, + { + "epoch": 4.998857142857143, + "eval_loss": 2.099808692932129, + "eval_runtime": 111.2808, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 2187 + }, + { + "epoch": 5.005714285714285, + "grad_norm": 0.9482853412628174, + "learning_rate": 0.0002, + "loss": 1.2183, + "step": 2190 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 1.6689555644989014, + "learning_rate": 0.0002, + "loss": 0.9898, + "step": 2200 + }, + { + "epoch": 5.051428571428572, + "grad_norm": 1.2019699811935425, + "learning_rate": 0.0002, + "loss": 0.9741, + "step": 2210 + }, + { + "epoch": 5.074285714285715, + "grad_norm": 1.535780429840088, + "learning_rate": 0.0002, + "loss": 0.9737, + "step": 2220 + }, + { + "epoch": 5.097142857142857, + "grad_norm": 1.2061309814453125, + "learning_rate": 0.0002, + "loss": 0.9494, + "step": 2230 + }, + { + "epoch": 5.12, + "grad_norm": 1.1898778676986694, + "learning_rate": 0.0002, + "loss": 0.9316, + "step": 2240 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 1.158898949623108, + "learning_rate": 0.0002, + "loss": 1.002, + "step": 2250 + }, + { + "epoch": 5.1657142857142855, + "grad_norm": 1.370749592781067, + "learning_rate": 0.0002, + "loss": 0.9715, + "step": 2260 + }, + { + "epoch": 5.188571428571429, + "grad_norm": 1.314120888710022, + "learning_rate": 0.0002, + "loss": 0.9365, + "step": 2270 + }, + { + "epoch": 5.211428571428572, + "grad_norm": 1.2184966802597046, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 2280 + }, + { + "epoch": 5.234285714285714, + "grad_norm": 1.4833279848098755, + "learning_rate": 0.0002, + "loss": 0.9407, + "step": 2290 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 1.3348219394683838, + "learning_rate": 0.0002, + "loss": 0.9635, + "step": 2300 + }, + { + "epoch": 5.28, + "grad_norm": 1.4166619777679443, + "learning_rate": 0.0002, + "loss": 1.0294, + "step": 2310 + }, + { + "epoch": 5.3028571428571425, + "grad_norm": 1.4539530277252197, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 2320 + }, + { + "epoch": 5.325714285714286, + "grad_norm": 1.4642518758773804, + "learning_rate": 0.0002, + "loss": 1.0165, + "step": 2330 + }, + { + "epoch": 5.348571428571429, + "grad_norm": 1.3938848972320557, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 2340 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 1.1147894859313965, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 2350 + }, + { + "epoch": 5.394285714285714, + "grad_norm": 1.3465309143066406, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2360 + }, + { + "epoch": 5.417142857142857, + "grad_norm": 1.4788566827774048, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 2370 + }, + { + "epoch": 5.44, + "grad_norm": 1.3808705806732178, + "learning_rate": 0.0002, + "loss": 0.9896, + "step": 2380 + }, + { + "epoch": 5.462857142857143, + "grad_norm": 1.2336329221725464, + "learning_rate": 0.0002, + "loss": 1.0279, + "step": 2390 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 1.5445678234100342, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 2400 + }, + { + "epoch": 5.508571428571429, + "grad_norm": 1.107488989830017, + "learning_rate": 0.0002, + "loss": 0.9534, + "step": 2410 + }, + { + "epoch": 5.531428571428571, + "grad_norm": 1.39687979221344, + "learning_rate": 0.0002, + "loss": 1.0036, + "step": 2420 + }, + { + "epoch": 5.554285714285714, + "grad_norm": 1.3905695676803589, + "learning_rate": 0.0002, + "loss": 0.9959, + "step": 2430 + }, + { + "epoch": 5.577142857142857, + "grad_norm": 1.3772821426391602, + "learning_rate": 0.0002, + "loss": 0.9912, + "step": 2440 + }, + { + "epoch": 5.6, + "grad_norm": 1.1661899089813232, + "learning_rate": 0.0002, + "loss": 0.9825, + "step": 2450 + }, + { + "epoch": 5.622857142857143, + "grad_norm": 1.2730463743209839, + "learning_rate": 0.0002, + "loss": 1.0003, + "step": 2460 + }, + { + "epoch": 5.645714285714286, + "grad_norm": 1.2251193523406982, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2470 + }, + { + "epoch": 5.668571428571429, + "grad_norm": 1.5454859733581543, + "learning_rate": 0.0002, + "loss": 1.079, + "step": 2480 + }, + { + "epoch": 5.691428571428571, + "grad_norm": 1.5405735969543457, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 2490 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.2555434703826904, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 2500 + }, + { + "epoch": 5.737142857142857, + "grad_norm": 1.3323487043380737, + "learning_rate": 0.0002, + "loss": 1.0019, + "step": 2510 + }, + { + "epoch": 5.76, + "grad_norm": 1.3106356859207153, + "learning_rate": 0.0002, + "loss": 1.051, + "step": 2520 + }, + { + "epoch": 5.782857142857143, + "grad_norm": 1.4832439422607422, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 2530 + }, + { + "epoch": 5.805714285714286, + "grad_norm": 1.1336562633514404, + "learning_rate": 0.0002, + "loss": 1.0643, + "step": 2540 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 1.2434223890304565, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2550 + }, + { + "epoch": 5.851428571428571, + "grad_norm": 1.2825450897216797, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 2560 + }, + { + "epoch": 5.8742857142857146, + "grad_norm": 1.4373180866241455, + "learning_rate": 0.0002, + "loss": 1.0642, + "step": 2570 + }, + { + "epoch": 5.897142857142857, + "grad_norm": 1.435015320777893, + "learning_rate": 0.0002, + "loss": 1.0814, + "step": 2580 + }, + { + "epoch": 5.92, + "grad_norm": 1.4075653553009033, + "learning_rate": 0.0002, + "loss": 1.0272, + "step": 2590 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 1.319630742073059, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 2600 + }, + { + "epoch": 5.965714285714286, + "grad_norm": 1.278330683708191, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 2610 + }, + { + "epoch": 5.988571428571428, + "grad_norm": 1.258158564567566, + "learning_rate": 0.0002, + "loss": 1.0766, + "step": 2620 + }, + { + "epoch": 6.0, + "eval_loss": 2.3689301013946533, + "eval_runtime": 53.9067, + "eval_samples_per_second": 9.405, + "eval_steps_per_second": 1.187, + "step": 2625 + }, + { + "epoch": 6.011428571428572, + "grad_norm": 1.3128368854522705, + "learning_rate": 0.0002, + "loss": 0.9142, + "step": 2630 + }, + { + "epoch": 6.034285714285715, + "grad_norm": 1.4280474185943604, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2640 + }, + { + "epoch": 6.057142857142857, + "grad_norm": 1.5061450004577637, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 2650 + }, + { + "epoch": 6.08, + "grad_norm": 1.6013342142105103, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 2660 + }, + { + "epoch": 6.102857142857143, + "grad_norm": 2.0107381343841553, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 2670 + }, + { + "epoch": 6.1257142857142854, + "grad_norm": 1.5010124444961548, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 2680 + }, + { + "epoch": 6.148571428571429, + "grad_norm": 1.5222150087356567, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 2690 + }, + { + "epoch": 6.171428571428572, + "grad_norm": 1.5413103103637695, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2700 + }, + { + "epoch": 6.194285714285714, + "grad_norm": 1.527140736579895, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 2710 + }, + { + "epoch": 6.217142857142857, + "grad_norm": 1.9386590719223022, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 2720 + }, + { + "epoch": 6.24, + "grad_norm": 1.8115214109420776, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 2730 + }, + { + "epoch": 6.2628571428571425, + "grad_norm": 1.6221802234649658, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 2740 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 1.6698768138885498, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 2750 + }, + { + "epoch": 6.308571428571429, + "grad_norm": 1.7960610389709473, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 2760 + }, + { + "epoch": 6.331428571428571, + "grad_norm": 1.32172429561615, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 2770 + }, + { + "epoch": 6.354285714285714, + "grad_norm": 1.7468090057373047, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 2780 + }, + { + "epoch": 6.377142857142857, + "grad_norm": 1.6777397394180298, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 2790 + }, + { + "epoch": 6.4, + "grad_norm": 1.6200671195983887, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2800 + }, + { + "epoch": 6.422857142857143, + "grad_norm": 1.723505973815918, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2810 + }, + { + "epoch": 6.445714285714286, + "grad_norm": 1.4945589303970337, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 2820 + }, + { + "epoch": 6.468571428571429, + "grad_norm": 1.666458010673523, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 2830 + }, + { + "epoch": 6.491428571428571, + "grad_norm": 1.6586525440216064, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 2840 + }, + { + "epoch": 6.514285714285714, + "grad_norm": 1.7480043172836304, + "learning_rate": 0.0002, + "loss": 0.8062, + "step": 2850 + }, + { + "epoch": 6.537142857142857, + "grad_norm": 1.4605649709701538, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 2860 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 1.4841814041137695, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 2870 + }, + { + "epoch": 6.582857142857143, + "grad_norm": 1.4653114080429077, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 2880 + }, + { + "epoch": 6.605714285714286, + "grad_norm": 1.7266837358474731, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2890 + }, + { + "epoch": 6.628571428571428, + "grad_norm": 1.4860098361968994, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 2900 + }, + { + "epoch": 6.651428571428571, + "grad_norm": 1.7177597284317017, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2910 + }, + { + "epoch": 6.674285714285714, + "grad_norm": 1.6757104396820068, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 2920 + }, + { + "epoch": 6.6971428571428575, + "grad_norm": 1.5177433490753174, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 2930 + }, + { + "epoch": 6.72, + "grad_norm": 1.8073889017105103, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 2940 + }, + { + "epoch": 6.742857142857143, + "grad_norm": 1.72337007522583, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2950 + }, + { + "epoch": 6.765714285714286, + "grad_norm": 1.6298240423202515, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 2960 + }, + { + "epoch": 6.788571428571428, + "grad_norm": 1.6140344142913818, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2970 + }, + { + "epoch": 6.811428571428571, + "grad_norm": 1.7180862426757812, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2980 + }, + { + "epoch": 6.8342857142857145, + "grad_norm": 1.7589894533157349, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2990 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 1.780195713043213, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 3000 + }, + { + "epoch": 6.88, + "grad_norm": 1.7182508707046509, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 3010 + }, + { + "epoch": 6.902857142857143, + "grad_norm": 1.6308406591415405, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3020 + }, + { + "epoch": 6.925714285714285, + "grad_norm": 1.5080229043960571, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 3030 + }, + { + "epoch": 6.948571428571428, + "grad_norm": 1.623555064201355, + "learning_rate": 0.0002, + "loss": 0.886, + "step": 3040 + }, + { + "epoch": 6.9714285714285715, + "grad_norm": 1.526054859161377, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3050 + }, + { + "epoch": 6.994285714285715, + "grad_norm": 1.6671174764633179, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 3060 + }, + { + "epoch": 6.998857142857143, + "eval_loss": 2.647613525390625, + "eval_runtime": 111.2255, + "eval_samples_per_second": 4.558, + "eval_steps_per_second": 0.575, + "step": 3062 + } + ], + "logging_steps": 10, + "max_steps": 3496, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5730147098624e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3062/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2900063d6dcf382201c28a35fb26eec7d38239cb --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52c050b52221caafacf459aff714f0d1f0e747a1dcb029bc27d138d6ae22ecdb +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee345e616e86721f680d98615f1d98a76693bab1 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4a5d2bf5be3b702f111f7a0a3723431a90f8b3eb84348e2c27080a9d62a5c32 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e289d2ab4f0910ba83fcff7962458d8e08d8b89c --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:185518ed415551a71a8c312313b99c2e0eed937898153ee7cae6d3b9d512de56 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b12c82193244f9aef9e538eef6ec5730bf8a0a88 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a6cdc16cfd9c080cdc06d33762e71556a697b7c6f820509ad58b752ec1a3b6 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d4a70e57c49993e56457f96bbd3bcf48434c2d55 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/trainer_state.json @@ -0,0 +1,2540 @@ +{ + "best_metric": 1.8279441595077515, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", + "epoch": 7.990857142857143, + "eval_steps": 10, + "global_step": 3496, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022857142857142857, + "grad_norm": 0.6273946762084961, + "learning_rate": 0.0002, + "loss": 3.066, + "step": 10 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.5300710201263428, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 20 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.6162196397781372, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 30 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.5143047571182251, + "learning_rate": 0.0002, + "loss": 2.1164, + "step": 40 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.4000673294067383, + "learning_rate": 0.0002, + "loss": 1.943, + "step": 50 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.444892555475235, + "learning_rate": 0.0002, + "loss": 1.9531, + "step": 60 + }, + { + "epoch": 0.16, + "grad_norm": 0.4871707558631897, + "learning_rate": 0.0002, + "loss": 1.9435, + "step": 70 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.451060026884079, + "learning_rate": 0.0002, + "loss": 1.9072, + "step": 80 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.3939569592475891, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 90 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.5033721923828125, + "learning_rate": 0.0002, + "loss": 1.8982, + "step": 100 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0002, + "loss": 1.9148, + "step": 110 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.4391206204891205, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 120 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.5243169665336609, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 0.4055655598640442, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 140 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.0002, + "loss": 1.8348, + "step": 150 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.4696349501609802, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 160 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.3987901508808136, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 170 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.32404327392578125, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 180 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.3692261576652527, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 190 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37267744541168213, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 200 + }, + { + "epoch": 0.48, + "grad_norm": 0.3559934198856354, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 210 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.3374815285205841, + "learning_rate": 0.0002, + "loss": 1.8651, + "step": 220 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.34598177671432495, + "learning_rate": 0.0002, + "loss": 1.8683, + "step": 230 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.35629919171333313, + "learning_rate": 0.0002, + "loss": 1.8554, + "step": 240 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3586862087249756, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 250 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.3198927342891693, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.37690025568008423, + "learning_rate": 0.0002, + "loss": 1.81, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.2855667471885681, + "learning_rate": 0.0002, + "loss": 1.8258, + "step": 280 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.3242695927619934, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 290 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2960120141506195, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 300 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.3596384823322296, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 310 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.3001834750175476, + "learning_rate": 0.0002, + "loss": 1.8132, + "step": 320 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.31361159682273865, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 330 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.3383876085281372, + "learning_rate": 0.0002, + "loss": 1.7674, + "step": 350 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.35100996494293213, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 360 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.344976007938385, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 370 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.3119729459285736, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 380 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.349221795797348, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 390 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3124293386936188, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 400 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.35504350066185, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 0.310310959815979, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 420 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.30432847142219543, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 430 + }, + { + "epoch": 0.9988571428571429, + "eval_loss": 1.8310153484344482, + "eval_runtime": 111.7814, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.573, + "step": 437 + }, + { + "epoch": 1.0057142857142858, + "grad_norm": 0.3121616840362549, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 440 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.3365118205547333, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 450 + }, + { + "epoch": 1.0514285714285714, + "grad_norm": 0.3626686930656433, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 460 + }, + { + "epoch": 1.0742857142857143, + "grad_norm": 0.30539533495903015, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 470 + }, + { + "epoch": 1.0971428571428572, + "grad_norm": 0.3159816861152649, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 480 + }, + { + "epoch": 1.12, + "grad_norm": 0.3695855736732483, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 490 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3609161674976349, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 500 + }, + { + "epoch": 1.1657142857142857, + "grad_norm": 0.3683869242668152, + "learning_rate": 0.0002, + "loss": 1.8723, + "step": 510 + }, + { + "epoch": 1.1885714285714286, + "grad_norm": 0.3862539529800415, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 520 + }, + { + "epoch": 1.2114285714285715, + "grad_norm": 0.4244740307331085, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 530 + }, + { + "epoch": 1.2342857142857142, + "grad_norm": 0.373703271150589, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 540 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.35715773701667786, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 550 + }, + { + "epoch": 1.28, + "grad_norm": 0.3555964231491089, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 560 + }, + { + "epoch": 1.302857142857143, + "grad_norm": 0.35080263018608093, + "learning_rate": 0.0002, + "loss": 1.7228, + "step": 570 + }, + { + "epoch": 1.3257142857142856, + "grad_norm": 0.3589482307434082, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 580 + }, + { + "epoch": 1.3485714285714285, + "grad_norm": 0.3711223900318146, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 590 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.313614159822464, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 600 + }, + { + "epoch": 1.3942857142857144, + "grad_norm": 0.3842357397079468, + "learning_rate": 0.0002, + "loss": 1.7191, + "step": 610 + }, + { + "epoch": 1.4171428571428573, + "grad_norm": 0.36126819252967834, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 620 + }, + { + "epoch": 1.44, + "grad_norm": 0.35922661423683167, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 630 + }, + { + "epoch": 1.4628571428571429, + "grad_norm": 0.3922875225543976, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 640 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.365546852350235, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 650 + }, + { + "epoch": 1.5085714285714285, + "grad_norm": 0.36107590794563293, + "learning_rate": 0.0002, + "loss": 1.674, + "step": 660 + }, + { + "epoch": 1.5314285714285716, + "grad_norm": 0.3307042121887207, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 670 + }, + { + "epoch": 1.5542857142857143, + "grad_norm": 0.3492133915424347, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 680 + }, + { + "epoch": 1.5771428571428572, + "grad_norm": 0.38608574867248535, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 690 + }, + { + "epoch": 1.6, + "grad_norm": 0.3489173650741577, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 700 + }, + { + "epoch": 1.6228571428571428, + "grad_norm": 0.36614152789115906, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 710 + }, + { + "epoch": 1.6457142857142857, + "grad_norm": 0.34340205788612366, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 720 + }, + { + "epoch": 1.6685714285714286, + "grad_norm": 0.34590771794319153, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 730 + }, + { + "epoch": 1.6914285714285713, + "grad_norm": 0.3759954273700714, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 740 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3753475546836853, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 750 + }, + { + "epoch": 1.737142857142857, + "grad_norm": 0.38416001200675964, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 760 + }, + { + "epoch": 1.76, + "grad_norm": 0.36223554611206055, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 770 + }, + { + "epoch": 1.782857142857143, + "grad_norm": 0.329556941986084, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 780 + }, + { + "epoch": 1.8057142857142856, + "grad_norm": 0.34008052945137024, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 790 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.40297919511795044, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 1.8514285714285714, + "grad_norm": 0.35378390550613403, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 810 + }, + { + "epoch": 1.8742857142857143, + "grad_norm": 0.3625478148460388, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 820 + }, + { + "epoch": 1.8971428571428572, + "grad_norm": 0.36153221130371094, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 830 + }, + { + "epoch": 1.92, + "grad_norm": 0.3612948954105377, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 840 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.399213045835495, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 850 + }, + { + "epoch": 1.9657142857142857, + "grad_norm": 0.40026402473449707, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 860 + }, + { + "epoch": 1.9885714285714284, + "grad_norm": 0.38114118576049805, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 870 + }, + { + "epoch": 2.0, + "eval_loss": 1.8279441595077515, + "eval_runtime": 111.1652, + "eval_samples_per_second": 4.561, + "eval_steps_per_second": 0.576, + "step": 875 + }, + { + "epoch": 2.0114285714285716, + "grad_norm": 0.33838793635368347, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 880 + }, + { + "epoch": 2.0342857142857143, + "grad_norm": 0.4219334125518799, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 890 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.43962377309799194, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 900 + }, + { + "epoch": 2.08, + "grad_norm": 0.41956576704978943, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 910 + }, + { + "epoch": 2.1028571428571428, + "grad_norm": 0.4439629912376404, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 920 + }, + { + "epoch": 2.125714285714286, + "grad_norm": 0.43405696749687195, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 930 + }, + { + "epoch": 2.1485714285714286, + "grad_norm": 0.4321737587451935, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 940 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.4689100682735443, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 950 + }, + { + "epoch": 2.1942857142857144, + "grad_norm": 0.47024697065353394, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 960 + }, + { + "epoch": 2.217142857142857, + "grad_norm": 0.4535103440284729, + "learning_rate": 0.0002, + "loss": 1.4703, + "step": 970 + }, + { + "epoch": 2.24, + "grad_norm": 0.45990121364593506, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 980 + }, + { + "epoch": 2.262857142857143, + "grad_norm": 0.48427215218544006, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 990 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.43076643347740173, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1000 + }, + { + "epoch": 2.3085714285714287, + "grad_norm": 0.4854483902454376, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 1010 + }, + { + "epoch": 2.3314285714285714, + "grad_norm": 0.46086496114730835, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 1020 + }, + { + "epoch": 2.354285714285714, + "grad_norm": 0.4714847505092621, + "learning_rate": 0.0002, + "loss": 1.6392, + "step": 1030 + }, + { + "epoch": 2.3771428571428572, + "grad_norm": 0.4423409402370453, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 1040 + }, + { + "epoch": 2.4, + "grad_norm": 0.46261295676231384, + "learning_rate": 0.0002, + "loss": 1.5821, + "step": 1050 + }, + { + "epoch": 2.422857142857143, + "grad_norm": 0.4914337396621704, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 1060 + }, + { + "epoch": 2.4457142857142857, + "grad_norm": 0.45144036412239075, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 1070 + }, + { + "epoch": 2.4685714285714284, + "grad_norm": 0.4510825276374817, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 1080 + }, + { + "epoch": 2.4914285714285715, + "grad_norm": 0.48552489280700684, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 1090 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.4768163859844208, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 1100 + }, + { + "epoch": 2.5371428571428574, + "grad_norm": 0.5192609429359436, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 1110 + }, + { + "epoch": 2.56, + "grad_norm": 0.49308598041534424, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 1120 + }, + { + "epoch": 2.5828571428571427, + "grad_norm": 0.5068584084510803, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1130 + }, + { + "epoch": 2.605714285714286, + "grad_norm": 0.4822661280632019, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 1140 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 0.5028144717216492, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 1150 + }, + { + "epoch": 2.6514285714285712, + "grad_norm": 0.48315200209617615, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 1160 + }, + { + "epoch": 2.6742857142857144, + "grad_norm": 0.551934540271759, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 1170 + }, + { + "epoch": 2.697142857142857, + "grad_norm": 0.49223729968070984, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 1180 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.514847457408905, + "learning_rate": 0.0002, + "loss": 1.6382, + "step": 1190 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.4830605387687683, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 1200 + }, + { + "epoch": 2.7657142857142856, + "grad_norm": 0.4584822952747345, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 1210 + }, + { + "epoch": 2.7885714285714287, + "grad_norm": 0.4688762426376343, + "learning_rate": 0.0002, + "loss": 1.6043, + "step": 1220 + }, + { + "epoch": 2.8114285714285714, + "grad_norm": 0.4488156735897064, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1230 + }, + { + "epoch": 2.8342857142857145, + "grad_norm": 0.4700278639793396, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 1240 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5282207131385803, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 1250 + }, + { + "epoch": 2.88, + "grad_norm": 0.4874219000339508, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 1260 + }, + { + "epoch": 2.902857142857143, + "grad_norm": 0.49468332529067993, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 1270 + }, + { + "epoch": 2.9257142857142857, + "grad_norm": 0.49770233035087585, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 1280 + }, + { + "epoch": 2.9485714285714284, + "grad_norm": 0.4433252811431885, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 1290 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.46836379170417786, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 1300 + }, + { + "epoch": 2.994285714285714, + "grad_norm": 0.5001904368400574, + "learning_rate": 0.0002, + "loss": 1.5555, + "step": 1310 + }, + { + "epoch": 2.998857142857143, + "eval_loss": 1.8684407472610474, + "eval_runtime": 111.2835, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1312 + }, + { + "epoch": 3.0171428571428573, + "grad_norm": 0.5184893012046814, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 1320 + }, + { + "epoch": 3.04, + "grad_norm": 0.5665355920791626, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 1330 + }, + { + "epoch": 3.0628571428571427, + "grad_norm": 0.6601403951644897, + "learning_rate": 0.0002, + "loss": 1.3741, + "step": 1340 + }, + { + "epoch": 3.085714285714286, + "grad_norm": 0.6921621561050415, + "learning_rate": 0.0002, + "loss": 1.433, + "step": 1350 + }, + { + "epoch": 3.1085714285714285, + "grad_norm": 0.6406348943710327, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 1360 + }, + { + "epoch": 3.1314285714285712, + "grad_norm": 0.5814554691314697, + "learning_rate": 0.0002, + "loss": 1.3563, + "step": 1370 + }, + { + "epoch": 3.1542857142857144, + "grad_norm": 0.683325469493866, + "learning_rate": 0.0002, + "loss": 1.4096, + "step": 1380 + }, + { + "epoch": 3.177142857142857, + "grad_norm": 0.6686155200004578, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 1390 + }, + { + "epoch": 3.2, + "grad_norm": 0.8159713745117188, + "learning_rate": 0.0002, + "loss": 1.4394, + "step": 1400 + }, + { + "epoch": 3.222857142857143, + "grad_norm": 0.646216094493866, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 1410 + }, + { + "epoch": 3.2457142857142856, + "grad_norm": 0.7323529720306396, + "learning_rate": 0.0002, + "loss": 1.4232, + "step": 1420 + }, + { + "epoch": 3.2685714285714287, + "grad_norm": 0.689349353313446, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 1430 + }, + { + "epoch": 3.2914285714285714, + "grad_norm": 0.727894127368927, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 1440 + }, + { + "epoch": 3.314285714285714, + "grad_norm": 0.6921590566635132, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 1450 + }, + { + "epoch": 3.337142857142857, + "grad_norm": 0.6176243424415588, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 1460 + }, + { + "epoch": 3.36, + "grad_norm": 0.9006354212760925, + "learning_rate": 0.0002, + "loss": 1.4323, + "step": 1470 + }, + { + "epoch": 3.382857142857143, + "grad_norm": 0.8145929574966431, + "learning_rate": 0.0002, + "loss": 1.4353, + "step": 1480 + }, + { + "epoch": 3.4057142857142857, + "grad_norm": 0.6640016436576843, + "learning_rate": 0.0002, + "loss": 1.3859, + "step": 1490 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.7266780138015747, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 1500 + }, + { + "epoch": 3.4514285714285715, + "grad_norm": 0.9351356029510498, + "learning_rate": 0.0002, + "loss": 1.4108, + "step": 1510 + }, + { + "epoch": 3.474285714285714, + "grad_norm": 0.675645649433136, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 1520 + }, + { + "epoch": 3.4971428571428573, + "grad_norm": 0.761472225189209, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 1530 + }, + { + "epoch": 3.52, + "grad_norm": 0.6653069257736206, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1540 + }, + { + "epoch": 3.5428571428571427, + "grad_norm": 0.667412519454956, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 1550 + }, + { + "epoch": 3.565714285714286, + "grad_norm": 0.6395593881607056, + "learning_rate": 0.0002, + "loss": 1.4241, + "step": 1560 + }, + { + "epoch": 3.5885714285714285, + "grad_norm": 0.7588621377944946, + "learning_rate": 0.0002, + "loss": 1.4825, + "step": 1570 + }, + { + "epoch": 3.611428571428571, + "grad_norm": 0.6206456422805786, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 1580 + }, + { + "epoch": 3.6342857142857143, + "grad_norm": 0.7591291666030884, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 1590 + }, + { + "epoch": 3.657142857142857, + "grad_norm": 0.6476313471794128, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 1600 + }, + { + "epoch": 3.68, + "grad_norm": 0.6731392741203308, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 1610 + }, + { + "epoch": 3.702857142857143, + "grad_norm": 0.725190281867981, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 1620 + }, + { + "epoch": 3.725714285714286, + "grad_norm": 0.6720049977302551, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 1630 + }, + { + "epoch": 3.7485714285714287, + "grad_norm": 0.6301007270812988, + "learning_rate": 0.0002, + "loss": 1.429, + "step": 1640 + }, + { + "epoch": 3.7714285714285714, + "grad_norm": 0.715893566608429, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 1650 + }, + { + "epoch": 3.7942857142857145, + "grad_norm": 0.7539359927177429, + "learning_rate": 0.0002, + "loss": 1.3624, + "step": 1660 + }, + { + "epoch": 3.817142857142857, + "grad_norm": 0.6658543348312378, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 1670 + }, + { + "epoch": 3.84, + "grad_norm": 0.7019526958465576, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 1680 + }, + { + "epoch": 3.862857142857143, + "grad_norm": 0.6517802476882935, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 1690 + }, + { + "epoch": 3.8857142857142857, + "grad_norm": 0.7617332935333252, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 1700 + }, + { + "epoch": 3.9085714285714284, + "grad_norm": 0.6919480562210083, + "learning_rate": 0.0002, + "loss": 1.5145, + "step": 1710 + }, + { + "epoch": 3.9314285714285715, + "grad_norm": 0.6987943053245544, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 1720 + }, + { + "epoch": 3.954285714285714, + "grad_norm": 0.7062228918075562, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 1730 + }, + { + "epoch": 3.977142857142857, + "grad_norm": 0.6769542098045349, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 1740 + }, + { + "epoch": 4.0, + "grad_norm": 0.6832144260406494, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 1750 + }, + { + "epoch": 4.0, + "eval_loss": 1.9474865198135376, + "eval_runtime": 111.288, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 1750 + }, + { + "epoch": 4.022857142857143, + "grad_norm": 1.064110279083252, + "learning_rate": 0.0002, + "loss": 1.2251, + "step": 1760 + }, + { + "epoch": 4.045714285714285, + "grad_norm": 0.8380683660507202, + "learning_rate": 0.0002, + "loss": 1.2013, + "step": 1770 + }, + { + "epoch": 4.0685714285714285, + "grad_norm": 1.1863020658493042, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 1780 + }, + { + "epoch": 4.091428571428572, + "grad_norm": 1.0128898620605469, + "learning_rate": 0.0002, + "loss": 1.2499, + "step": 1790 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 0.9221312403678894, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 1800 + }, + { + "epoch": 4.137142857142857, + "grad_norm": 1.1298727989196777, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 1810 + }, + { + "epoch": 4.16, + "grad_norm": 0.8854547739028931, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 1820 + }, + { + "epoch": 4.182857142857143, + "grad_norm": 0.8920808434486389, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 1830 + }, + { + "epoch": 4.2057142857142855, + "grad_norm": 0.913244366645813, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 1840 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.908831000328064, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 1850 + }, + { + "epoch": 4.251428571428572, + "grad_norm": 1.0223685503005981, + "learning_rate": 0.0002, + "loss": 1.1653, + "step": 1860 + }, + { + "epoch": 4.274285714285714, + "grad_norm": 0.9771921634674072, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 1870 + }, + { + "epoch": 4.297142857142857, + "grad_norm": 0.9313384890556335, + "learning_rate": 0.0002, + "loss": 1.213, + "step": 1880 + }, + { + "epoch": 4.32, + "grad_norm": 1.0754257440567017, + "learning_rate": 0.0002, + "loss": 1.1723, + "step": 1890 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.8904672265052795, + "learning_rate": 0.0002, + "loss": 1.2286, + "step": 1900 + }, + { + "epoch": 4.365714285714286, + "grad_norm": 1.046527624130249, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 1910 + }, + { + "epoch": 4.388571428571429, + "grad_norm": 0.9576982855796814, + "learning_rate": 0.0002, + "loss": 1.2368, + "step": 1920 + }, + { + "epoch": 4.411428571428571, + "grad_norm": 0.9278356432914734, + "learning_rate": 0.0002, + "loss": 1.211, + "step": 1930 + }, + { + "epoch": 4.434285714285714, + "grad_norm": 1.1763030290603638, + "learning_rate": 0.0002, + "loss": 1.2005, + "step": 1940 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.9183000326156616, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 1950 + }, + { + "epoch": 4.48, + "grad_norm": 1.050980806350708, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 1960 + }, + { + "epoch": 4.502857142857143, + "grad_norm": 0.9975392818450928, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 1970 + }, + { + "epoch": 4.525714285714286, + "grad_norm": 0.990544319152832, + "learning_rate": 0.0002, + "loss": 1.2312, + "step": 1980 + }, + { + "epoch": 4.548571428571429, + "grad_norm": 1.004794955253601, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 1990 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.9294857978820801, + "learning_rate": 0.0002, + "loss": 1.2085, + "step": 2000 + }, + { + "epoch": 4.594285714285714, + "grad_norm": 0.93436598777771, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 2010 + }, + { + "epoch": 4.617142857142857, + "grad_norm": 0.8704655766487122, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 2020 + }, + { + "epoch": 4.64, + "grad_norm": 0.9077927470207214, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 2030 + }, + { + "epoch": 4.662857142857143, + "grad_norm": 0.912987470626831, + "learning_rate": 0.0002, + "loss": 1.2198, + "step": 2040 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.9740643501281738, + "learning_rate": 0.0002, + "loss": 1.2868, + "step": 2050 + }, + { + "epoch": 4.708571428571428, + "grad_norm": 1.133357048034668, + "learning_rate": 0.0002, + "loss": 1.249, + "step": 2060 + }, + { + "epoch": 4.731428571428571, + "grad_norm": 0.8844527006149292, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 2070 + }, + { + "epoch": 4.7542857142857144, + "grad_norm": 1.0083311796188354, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 2080 + }, + { + "epoch": 4.777142857142858, + "grad_norm": 1.000447154045105, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 2090 + }, + { + "epoch": 4.8, + "grad_norm": 0.9620300531387329, + "learning_rate": 0.0002, + "loss": 1.2313, + "step": 2100 + }, + { + "epoch": 4.822857142857143, + "grad_norm": 0.9843335151672363, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 2110 + }, + { + "epoch": 4.845714285714286, + "grad_norm": 0.9906681180000305, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 2120 + }, + { + "epoch": 4.868571428571428, + "grad_norm": 0.9544073939323425, + "learning_rate": 0.0002, + "loss": 1.2325, + "step": 2130 + }, + { + "epoch": 4.8914285714285715, + "grad_norm": 0.9392994046211243, + "learning_rate": 0.0002, + "loss": 1.284, + "step": 2140 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 1.104519248008728, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 2150 + }, + { + "epoch": 4.937142857142857, + "grad_norm": 0.9495956897735596, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 2160 + }, + { + "epoch": 4.96, + "grad_norm": 0.9696287512779236, + "learning_rate": 0.0002, + "loss": 1.2412, + "step": 2170 + }, + { + "epoch": 4.982857142857143, + "grad_norm": 0.9933681488037109, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 2180 + }, + { + "epoch": 4.998857142857143, + "eval_loss": 2.099808692932129, + "eval_runtime": 111.2808, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.575, + "step": 2187 + }, + { + "epoch": 5.005714285714285, + "grad_norm": 0.9482853412628174, + "learning_rate": 0.0002, + "loss": 1.2183, + "step": 2190 + }, + { + "epoch": 5.0285714285714285, + "grad_norm": 1.6689555644989014, + "learning_rate": 0.0002, + "loss": 0.9898, + "step": 2200 + }, + { + "epoch": 5.051428571428572, + "grad_norm": 1.2019699811935425, + "learning_rate": 0.0002, + "loss": 0.9741, + "step": 2210 + }, + { + "epoch": 5.074285714285715, + "grad_norm": 1.535780429840088, + "learning_rate": 0.0002, + "loss": 0.9737, + "step": 2220 + }, + { + "epoch": 5.097142857142857, + "grad_norm": 1.2061309814453125, + "learning_rate": 0.0002, + "loss": 0.9494, + "step": 2230 + }, + { + "epoch": 5.12, + "grad_norm": 1.1898778676986694, + "learning_rate": 0.0002, + "loss": 0.9316, + "step": 2240 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 1.158898949623108, + "learning_rate": 0.0002, + "loss": 1.002, + "step": 2250 + }, + { + "epoch": 5.1657142857142855, + "grad_norm": 1.370749592781067, + "learning_rate": 0.0002, + "loss": 0.9715, + "step": 2260 + }, + { + "epoch": 5.188571428571429, + "grad_norm": 1.314120888710022, + "learning_rate": 0.0002, + "loss": 0.9365, + "step": 2270 + }, + { + "epoch": 5.211428571428572, + "grad_norm": 1.2184966802597046, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 2280 + }, + { + "epoch": 5.234285714285714, + "grad_norm": 1.4833279848098755, + "learning_rate": 0.0002, + "loss": 0.9407, + "step": 2290 + }, + { + "epoch": 5.257142857142857, + "grad_norm": 1.3348219394683838, + "learning_rate": 0.0002, + "loss": 0.9635, + "step": 2300 + }, + { + "epoch": 5.28, + "grad_norm": 1.4166619777679443, + "learning_rate": 0.0002, + "loss": 1.0294, + "step": 2310 + }, + { + "epoch": 5.3028571428571425, + "grad_norm": 1.4539530277252197, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 2320 + }, + { + "epoch": 5.325714285714286, + "grad_norm": 1.4642518758773804, + "learning_rate": 0.0002, + "loss": 1.0165, + "step": 2330 + }, + { + "epoch": 5.348571428571429, + "grad_norm": 1.3938848972320557, + "learning_rate": 0.0002, + "loss": 1.0081, + "step": 2340 + }, + { + "epoch": 5.371428571428572, + "grad_norm": 1.1147894859313965, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 2350 + }, + { + "epoch": 5.394285714285714, + "grad_norm": 1.3465309143066406, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2360 + }, + { + "epoch": 5.417142857142857, + "grad_norm": 1.4788566827774048, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 2370 + }, + { + "epoch": 5.44, + "grad_norm": 1.3808705806732178, + "learning_rate": 0.0002, + "loss": 0.9896, + "step": 2380 + }, + { + "epoch": 5.462857142857143, + "grad_norm": 1.2336329221725464, + "learning_rate": 0.0002, + "loss": 1.0279, + "step": 2390 + }, + { + "epoch": 5.485714285714286, + "grad_norm": 1.5445678234100342, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 2400 + }, + { + "epoch": 5.508571428571429, + "grad_norm": 1.107488989830017, + "learning_rate": 0.0002, + "loss": 0.9534, + "step": 2410 + }, + { + "epoch": 5.531428571428571, + "grad_norm": 1.39687979221344, + "learning_rate": 0.0002, + "loss": 1.0036, + "step": 2420 + }, + { + "epoch": 5.554285714285714, + "grad_norm": 1.3905695676803589, + "learning_rate": 0.0002, + "loss": 0.9959, + "step": 2430 + }, + { + "epoch": 5.577142857142857, + "grad_norm": 1.3772821426391602, + "learning_rate": 0.0002, + "loss": 0.9912, + "step": 2440 + }, + { + "epoch": 5.6, + "grad_norm": 1.1661899089813232, + "learning_rate": 0.0002, + "loss": 0.9825, + "step": 2450 + }, + { + "epoch": 5.622857142857143, + "grad_norm": 1.2730463743209839, + "learning_rate": 0.0002, + "loss": 1.0003, + "step": 2460 + }, + { + "epoch": 5.645714285714286, + "grad_norm": 1.2251193523406982, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2470 + }, + { + "epoch": 5.668571428571429, + "grad_norm": 1.5454859733581543, + "learning_rate": 0.0002, + "loss": 1.079, + "step": 2480 + }, + { + "epoch": 5.691428571428571, + "grad_norm": 1.5405735969543457, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 2490 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.2555434703826904, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 2500 + }, + { + "epoch": 5.737142857142857, + "grad_norm": 1.3323487043380737, + "learning_rate": 0.0002, + "loss": 1.0019, + "step": 2510 + }, + { + "epoch": 5.76, + "grad_norm": 1.3106356859207153, + "learning_rate": 0.0002, + "loss": 1.051, + "step": 2520 + }, + { + "epoch": 5.782857142857143, + "grad_norm": 1.4832439422607422, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 2530 + }, + { + "epoch": 5.805714285714286, + "grad_norm": 1.1336562633514404, + "learning_rate": 0.0002, + "loss": 1.0643, + "step": 2540 + }, + { + "epoch": 5.828571428571428, + "grad_norm": 1.2434223890304565, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2550 + }, + { + "epoch": 5.851428571428571, + "grad_norm": 1.2825450897216797, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 2560 + }, + { + "epoch": 5.8742857142857146, + "grad_norm": 1.4373180866241455, + "learning_rate": 0.0002, + "loss": 1.0642, + "step": 2570 + }, + { + "epoch": 5.897142857142857, + "grad_norm": 1.435015320777893, + "learning_rate": 0.0002, + "loss": 1.0814, + "step": 2580 + }, + { + "epoch": 5.92, + "grad_norm": 1.4075653553009033, + "learning_rate": 0.0002, + "loss": 1.0272, + "step": 2590 + }, + { + "epoch": 5.942857142857143, + "grad_norm": 1.319630742073059, + "learning_rate": 0.0002, + "loss": 1.0703, + "step": 2600 + }, + { + "epoch": 5.965714285714286, + "grad_norm": 1.278330683708191, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 2610 + }, + { + "epoch": 5.988571428571428, + "grad_norm": 1.258158564567566, + "learning_rate": 0.0002, + "loss": 1.0766, + "step": 2620 + }, + { + "epoch": 6.0, + "eval_loss": 2.3689301013946533, + "eval_runtime": 53.9067, + "eval_samples_per_second": 9.405, + "eval_steps_per_second": 1.187, + "step": 2625 + }, + { + "epoch": 6.011428571428572, + "grad_norm": 1.3128368854522705, + "learning_rate": 0.0002, + "loss": 0.9142, + "step": 2630 + }, + { + "epoch": 6.034285714285715, + "grad_norm": 1.4280474185943604, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2640 + }, + { + "epoch": 6.057142857142857, + "grad_norm": 1.5061450004577637, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 2650 + }, + { + "epoch": 6.08, + "grad_norm": 1.6013342142105103, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 2660 + }, + { + "epoch": 6.102857142857143, + "grad_norm": 2.0107381343841553, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 2670 + }, + { + "epoch": 6.1257142857142854, + "grad_norm": 1.5010124444961548, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 2680 + }, + { + "epoch": 6.148571428571429, + "grad_norm": 1.5222150087356567, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 2690 + }, + { + "epoch": 6.171428571428572, + "grad_norm": 1.5413103103637695, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2700 + }, + { + "epoch": 6.194285714285714, + "grad_norm": 1.527140736579895, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 2710 + }, + { + "epoch": 6.217142857142857, + "grad_norm": 1.9386590719223022, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 2720 + }, + { + "epoch": 6.24, + "grad_norm": 1.8115214109420776, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 2730 + }, + { + "epoch": 6.2628571428571425, + "grad_norm": 1.6221802234649658, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 2740 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 1.6698768138885498, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 2750 + }, + { + "epoch": 6.308571428571429, + "grad_norm": 1.7960610389709473, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 2760 + }, + { + "epoch": 6.331428571428571, + "grad_norm": 1.32172429561615, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 2770 + }, + { + "epoch": 6.354285714285714, + "grad_norm": 1.7468090057373047, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 2780 + }, + { + "epoch": 6.377142857142857, + "grad_norm": 1.6777397394180298, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 2790 + }, + { + "epoch": 6.4, + "grad_norm": 1.6200671195983887, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2800 + }, + { + "epoch": 6.422857142857143, + "grad_norm": 1.723505973815918, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2810 + }, + { + "epoch": 6.445714285714286, + "grad_norm": 1.4945589303970337, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 2820 + }, + { + "epoch": 6.468571428571429, + "grad_norm": 1.666458010673523, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 2830 + }, + { + "epoch": 6.491428571428571, + "grad_norm": 1.6586525440216064, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 2840 + }, + { + "epoch": 6.514285714285714, + "grad_norm": 1.7480043172836304, + "learning_rate": 0.0002, + "loss": 0.8062, + "step": 2850 + }, + { + "epoch": 6.537142857142857, + "grad_norm": 1.4605649709701538, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 2860 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 1.4841814041137695, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 2870 + }, + { + "epoch": 6.582857142857143, + "grad_norm": 1.4653114080429077, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 2880 + }, + { + "epoch": 6.605714285714286, + "grad_norm": 1.7266837358474731, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2890 + }, + { + "epoch": 6.628571428571428, + "grad_norm": 1.4860098361968994, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 2900 + }, + { + "epoch": 6.651428571428571, + "grad_norm": 1.7177597284317017, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2910 + }, + { + "epoch": 6.674285714285714, + "grad_norm": 1.6757104396820068, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 2920 + }, + { + "epoch": 6.6971428571428575, + "grad_norm": 1.5177433490753174, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 2930 + }, + { + "epoch": 6.72, + "grad_norm": 1.8073889017105103, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 2940 + }, + { + "epoch": 6.742857142857143, + "grad_norm": 1.72337007522583, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2950 + }, + { + "epoch": 6.765714285714286, + "grad_norm": 1.6298240423202515, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 2960 + }, + { + "epoch": 6.788571428571428, + "grad_norm": 1.6140344142913818, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2970 + }, + { + "epoch": 6.811428571428571, + "grad_norm": 1.7180862426757812, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2980 + }, + { + "epoch": 6.8342857142857145, + "grad_norm": 1.7589894533157349, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2990 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 1.780195713043213, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 3000 + }, + { + "epoch": 6.88, + "grad_norm": 1.7182508707046509, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 3010 + }, + { + "epoch": 6.902857142857143, + "grad_norm": 1.6308406591415405, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3020 + }, + { + "epoch": 6.925714285714285, + "grad_norm": 1.5080229043960571, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 3030 + }, + { + "epoch": 6.948571428571428, + "grad_norm": 1.623555064201355, + "learning_rate": 0.0002, + "loss": 0.886, + "step": 3040 + }, + { + "epoch": 6.9714285714285715, + "grad_norm": 1.526054859161377, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3050 + }, + { + "epoch": 6.994285714285715, + "grad_norm": 1.6671174764633179, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 3060 + }, + { + "epoch": 6.998857142857143, + "eval_loss": 2.647613525390625, + "eval_runtime": 111.2255, + "eval_samples_per_second": 4.558, + "eval_steps_per_second": 0.575, + "step": 3062 + }, + { + "epoch": 7.017142857142857, + "grad_norm": 1.9154540300369263, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 3070 + }, + { + "epoch": 7.04, + "grad_norm": 2.1938717365264893, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 3080 + }, + { + "epoch": 7.062857142857143, + "grad_norm": 1.7861053943634033, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 3090 + }, + { + "epoch": 7.085714285714285, + "grad_norm": 2.096458911895752, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 3100 + }, + { + "epoch": 7.1085714285714285, + "grad_norm": 2.0057616233825684, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 3110 + }, + { + "epoch": 7.131428571428572, + "grad_norm": 1.7073354721069336, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 3120 + }, + { + "epoch": 7.154285714285714, + "grad_norm": 2.3477938175201416, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 3130 + }, + { + "epoch": 7.177142857142857, + "grad_norm": 2.0903899669647217, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 3140 + }, + { + "epoch": 7.2, + "grad_norm": 1.7363157272338867, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 3150 + }, + { + "epoch": 7.222857142857142, + "grad_norm": 2.0611023902893066, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 3160 + }, + { + "epoch": 7.2457142857142856, + "grad_norm": 2.404407501220703, + "learning_rate": 0.0002, + "loss": 0.5865, + "step": 3170 + }, + { + "epoch": 7.268571428571429, + "grad_norm": 2.1841039657592773, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 3180 + }, + { + "epoch": 7.291428571428572, + "grad_norm": 1.7582741975784302, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 3190 + }, + { + "epoch": 7.314285714285714, + "grad_norm": 1.8890602588653564, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 3200 + }, + { + "epoch": 7.337142857142857, + "grad_norm": 1.8433198928833008, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 3210 + }, + { + "epoch": 7.36, + "grad_norm": 1.652266263961792, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 3220 + }, + { + "epoch": 7.382857142857143, + "grad_norm": 1.914348840713501, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 3230 + }, + { + "epoch": 7.405714285714286, + "grad_norm": 1.7440582513809204, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 3240 + }, + { + "epoch": 7.428571428571429, + "grad_norm": 1.9745666980743408, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 3250 + }, + { + "epoch": 7.451428571428571, + "grad_norm": 1.6567715406417847, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 3260 + }, + { + "epoch": 7.474285714285714, + "grad_norm": 1.5239425897598267, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 3270 + }, + { + "epoch": 7.497142857142857, + "grad_norm": 2.0668740272521973, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 3280 + }, + { + "epoch": 7.52, + "grad_norm": 1.9551687240600586, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 3290 + }, + { + "epoch": 7.542857142857143, + "grad_norm": 2.276602268218994, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 3300 + }, + { + "epoch": 7.565714285714286, + "grad_norm": 1.9060227870941162, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 3310 + }, + { + "epoch": 7.588571428571429, + "grad_norm": 2.0276358127593994, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 3320 + }, + { + "epoch": 7.611428571428571, + "grad_norm": 2.037238121032715, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 3330 + }, + { + "epoch": 7.634285714285714, + "grad_norm": 2.0060055255889893, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 3340 + }, + { + "epoch": 7.6571428571428575, + "grad_norm": 1.8366512060165405, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 3350 + }, + { + "epoch": 7.68, + "grad_norm": 2.0789284706115723, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 3360 + }, + { + "epoch": 7.702857142857143, + "grad_norm": 2.137089490890503, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 3370 + }, + { + "epoch": 7.725714285714286, + "grad_norm": 1.829277753829956, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 3380 + }, + { + "epoch": 7.748571428571428, + "grad_norm": 1.9483778476715088, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 3390 + }, + { + "epoch": 7.771428571428571, + "grad_norm": 2.0347065925598145, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 3400 + }, + { + "epoch": 7.7942857142857145, + "grad_norm": 2.0142312049865723, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 3410 + }, + { + "epoch": 7.817142857142857, + "grad_norm": 2.152569055557251, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 3420 + }, + { + "epoch": 7.84, + "grad_norm": 1.7300190925598145, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 3430 + }, + { + "epoch": 7.862857142857143, + "grad_norm": 2.3944954872131348, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 3440 + }, + { + "epoch": 7.885714285714286, + "grad_norm": 2.1004269123077393, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 3450 + }, + { + "epoch": 7.908571428571428, + "grad_norm": 2.05513072013855, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 3460 + }, + { + "epoch": 7.9314285714285715, + "grad_norm": 1.9822633266448975, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 3470 + }, + { + "epoch": 7.954285714285715, + "grad_norm": 1.9649063348770142, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 3480 + }, + { + "epoch": 7.977142857142857, + "grad_norm": 1.7002657651901245, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 3490 + }, + { + "epoch": 7.990857142857143, + "eval_loss": 3.0625548362731934, + "eval_runtime": 53.8832, + "eval_samples_per_second": 9.409, + "eval_steps_per_second": 1.188, + "step": 3496 + } + ], + "logging_steps": 10, + "max_steps": 3496, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.7956765471604736e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-3496/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fdedd2597ed628b316d57a658eb47e79e78b3be0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35609dd580a3315061cc133ca3b6c33456f1569461f910179abb235a198c055c +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3826f1b6568ff0b417422be811b643e8db3fa8b9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b40ef2a31487bcb182b56d8976aa361fa65f75e01cf80a4077fe981e4b31ae7 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd4d4e551aac7509b0d6c639582b1f4009777fc4 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ae52459ca240d2c9a269e617193c16cbe0b127c34a4272b142d33e442e92ea3 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0517455a6d3ba3b19b750a04281120ba36256c6a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5bba0f0dd0346fd33c336ada199442da884b97bac60b1d0fab07049a506ae05 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3aa815973df0fc3a362a201a3f0fbe67053f9e78 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/trainer_state.json @@ -0,0 +1,342 @@ +{ + "best_metric": 1.8310153484344482, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437", + "epoch": 0.9988571428571429, + "eval_steps": 10, + "global_step": 437, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022857142857142857, + "grad_norm": 0.6273946762084961, + "learning_rate": 0.0002, + "loss": 3.066, + "step": 10 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.5300710201263428, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 20 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.6162196397781372, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 30 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.5143047571182251, + "learning_rate": 0.0002, + "loss": 2.1164, + "step": 40 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.4000673294067383, + "learning_rate": 0.0002, + "loss": 1.943, + "step": 50 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.444892555475235, + "learning_rate": 0.0002, + "loss": 1.9531, + "step": 60 + }, + { + "epoch": 0.16, + "grad_norm": 0.4871707558631897, + "learning_rate": 0.0002, + "loss": 1.9435, + "step": 70 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.451060026884079, + "learning_rate": 0.0002, + "loss": 1.9072, + "step": 80 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.3939569592475891, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 90 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.5033721923828125, + "learning_rate": 0.0002, + "loss": 1.8982, + "step": 100 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0002, + "loss": 1.9148, + "step": 110 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.4391206204891205, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 120 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.5243169665336609, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 0.4055655598640442, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 140 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.0002, + "loss": 1.8348, + "step": 150 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.4696349501609802, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 160 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.3987901508808136, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 170 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.32404327392578125, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 180 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.3692261576652527, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 190 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37267744541168213, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 200 + }, + { + "epoch": 0.48, + "grad_norm": 0.3559934198856354, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 210 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.3374815285205841, + "learning_rate": 0.0002, + "loss": 1.8651, + "step": 220 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.34598177671432495, + "learning_rate": 0.0002, + "loss": 1.8683, + "step": 230 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.35629919171333313, + "learning_rate": 0.0002, + "loss": 1.8554, + "step": 240 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3586862087249756, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 250 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.3198927342891693, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.37690025568008423, + "learning_rate": 0.0002, + "loss": 1.81, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.2855667471885681, + "learning_rate": 0.0002, + "loss": 1.8258, + "step": 280 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.3242695927619934, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 290 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2960120141506195, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 300 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.3596384823322296, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 310 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.3001834750175476, + "learning_rate": 0.0002, + "loss": 1.8132, + "step": 320 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.31361159682273865, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 330 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.3383876085281372, + "learning_rate": 0.0002, + "loss": 1.7674, + "step": 350 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.35100996494293213, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 360 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.344976007938385, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 370 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.3119729459285736, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 380 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.349221795797348, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 390 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3124293386936188, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 400 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.35504350066185, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 0.310310959815979, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 420 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.30432847142219543, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 430 + }, + { + "epoch": 0.9988571428571429, + "eval_loss": 1.8310153484344482, + "eval_runtime": 111.7814, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.573, + "step": 437 + } + ], + "logging_steps": 10, + "max_steps": 3496, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.247163871232e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b971d2d3728c073474494167a8733e971e412cf --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a973c78893d5c5f843df8157e4f5f5d58bd4afd185281397acca339964604cf +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c85437aafa0232354eb71a3750bfc543cac9f55e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3b3bf45f015bfc673da78e4ffff095a436715fa4f9e50945493f644b659ee6d +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6eae44d3e8693f5b1418d6e94ab297937fbf90ad --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f9ed95d68c1bbf14501d3029a46f035f89ff87248d2476aef8f988288c811c7 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4e46ab564a8a045362ac55dc1e7edd5847d63a0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af1fa39639e63e8234687469d9a055a8928be9f72585a801dd0078003c362e0d +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f7eb452ecd602d25a35ee2fe60aa84ee17cbede0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/trainer_state.json @@ -0,0 +1,658 @@ +{ + "best_metric": 1.8279441595077515, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 875, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.022857142857142857, + "grad_norm": 0.6273946762084961, + "learning_rate": 0.0002, + "loss": 3.066, + "step": 10 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.5300710201263428, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 20 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.6162196397781372, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 30 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.5143047571182251, + "learning_rate": 0.0002, + "loss": 2.1164, + "step": 40 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.4000673294067383, + "learning_rate": 0.0002, + "loss": 1.943, + "step": 50 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.444892555475235, + "learning_rate": 0.0002, + "loss": 1.9531, + "step": 60 + }, + { + "epoch": 0.16, + "grad_norm": 0.4871707558631897, + "learning_rate": 0.0002, + "loss": 1.9435, + "step": 70 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.451060026884079, + "learning_rate": 0.0002, + "loss": 1.9072, + "step": 80 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.3939569592475891, + "learning_rate": 0.0002, + "loss": 1.9312, + "step": 90 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.5033721923828125, + "learning_rate": 0.0002, + "loss": 1.8982, + "step": 100 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0002, + "loss": 1.9148, + "step": 110 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.4391206204891205, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 120 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.5243169665336609, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 0.4055655598640442, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 140 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.0002, + "loss": 1.8348, + "step": 150 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.4696349501609802, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 160 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.3987901508808136, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 170 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.32404327392578125, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 180 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.3692261576652527, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 190 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.37267744541168213, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 200 + }, + { + "epoch": 0.48, + "grad_norm": 0.3559934198856354, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 210 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.3374815285205841, + "learning_rate": 0.0002, + "loss": 1.8651, + "step": 220 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.34598177671432495, + "learning_rate": 0.0002, + "loss": 1.8683, + "step": 230 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.35629919171333313, + "learning_rate": 0.0002, + "loss": 1.8554, + "step": 240 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.3586862087249756, + "learning_rate": 0.0002, + "loss": 1.8751, + "step": 250 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.3198927342891693, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 260 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.37690025568008423, + "learning_rate": 0.0002, + "loss": 1.81, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.2855667471885681, + "learning_rate": 0.0002, + "loss": 1.8258, + "step": 280 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.3242695927619934, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 290 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2960120141506195, + "learning_rate": 0.0002, + "loss": 1.7738, + "step": 300 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.3596384823322296, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 310 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.3001834750175476, + "learning_rate": 0.0002, + "loss": 1.8132, + "step": 320 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.31361159682273865, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 330 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.34093308448791504, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.3383876085281372, + "learning_rate": 0.0002, + "loss": 1.7674, + "step": 350 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.35100996494293213, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 360 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.344976007938385, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 370 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.3119729459285736, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 380 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.349221795797348, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 390 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.3124293386936188, + "learning_rate": 0.0002, + "loss": 1.7529, + "step": 400 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.35504350066185, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 0.310310959815979, + "learning_rate": 0.0002, + "loss": 1.8115, + "step": 420 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.30432847142219543, + "learning_rate": 0.0002, + "loss": 1.7666, + "step": 430 + }, + { + "epoch": 0.9988571428571429, + "eval_loss": 1.8310153484344482, + "eval_runtime": 111.7814, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.573, + "step": 437 + }, + { + "epoch": 1.0057142857142858, + "grad_norm": 0.3121616840362549, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 440 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.3365118205547333, + "learning_rate": 0.0002, + "loss": 1.7404, + "step": 450 + }, + { + "epoch": 1.0514285714285714, + "grad_norm": 0.3626686930656433, + "learning_rate": 0.0002, + "loss": 1.7901, + "step": 460 + }, + { + "epoch": 1.0742857142857143, + "grad_norm": 0.30539533495903015, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 470 + }, + { + "epoch": 1.0971428571428572, + "grad_norm": 0.3159816861152649, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 480 + }, + { + "epoch": 1.12, + "grad_norm": 0.3695855736732483, + "learning_rate": 0.0002, + "loss": 1.7471, + "step": 490 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3609161674976349, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 500 + }, + { + "epoch": 1.1657142857142857, + "grad_norm": 0.3683869242668152, + "learning_rate": 0.0002, + "loss": 1.8723, + "step": 510 + }, + { + "epoch": 1.1885714285714286, + "grad_norm": 0.3862539529800415, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 520 + }, + { + "epoch": 1.2114285714285715, + "grad_norm": 0.4244740307331085, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 530 + }, + { + "epoch": 1.2342857142857142, + "grad_norm": 0.373703271150589, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 540 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.35715773701667786, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 550 + }, + { + "epoch": 1.28, + "grad_norm": 0.3555964231491089, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 560 + }, + { + "epoch": 1.302857142857143, + "grad_norm": 0.35080263018608093, + "learning_rate": 0.0002, + "loss": 1.7228, + "step": 570 + }, + { + "epoch": 1.3257142857142856, + "grad_norm": 0.3589482307434082, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 580 + }, + { + "epoch": 1.3485714285714285, + "grad_norm": 0.3711223900318146, + "learning_rate": 0.0002, + "loss": 1.7369, + "step": 590 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.313614159822464, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 600 + }, + { + "epoch": 1.3942857142857144, + "grad_norm": 0.3842357397079468, + "learning_rate": 0.0002, + "loss": 1.7191, + "step": 610 + }, + { + "epoch": 1.4171428571428573, + "grad_norm": 0.36126819252967834, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 620 + }, + { + "epoch": 1.44, + "grad_norm": 0.35922661423683167, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 630 + }, + { + "epoch": 1.4628571428571429, + "grad_norm": 0.3922875225543976, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 640 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.365546852350235, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 650 + }, + { + "epoch": 1.5085714285714285, + "grad_norm": 0.36107590794563293, + "learning_rate": 0.0002, + "loss": 1.674, + "step": 660 + }, + { + "epoch": 1.5314285714285716, + "grad_norm": 0.3307042121887207, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 670 + }, + { + "epoch": 1.5542857142857143, + "grad_norm": 0.3492133915424347, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 680 + }, + { + "epoch": 1.5771428571428572, + "grad_norm": 0.38608574867248535, + "learning_rate": 0.0002, + "loss": 1.812, + "step": 690 + }, + { + "epoch": 1.6, + "grad_norm": 0.3489173650741577, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 700 + }, + { + "epoch": 1.6228571428571428, + "grad_norm": 0.36614152789115906, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 710 + }, + { + "epoch": 1.6457142857142857, + "grad_norm": 0.34340205788612366, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 720 + }, + { + "epoch": 1.6685714285714286, + "grad_norm": 0.34590771794319153, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 730 + }, + { + "epoch": 1.6914285714285713, + "grad_norm": 0.3759954273700714, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 740 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3753475546836853, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 750 + }, + { + "epoch": 1.737142857142857, + "grad_norm": 0.38416001200675964, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 760 + }, + { + "epoch": 1.76, + "grad_norm": 0.36223554611206055, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 770 + }, + { + "epoch": 1.782857142857143, + "grad_norm": 0.329556941986084, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 780 + }, + { + "epoch": 1.8057142857142856, + "grad_norm": 0.34008052945137024, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 790 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.40297919511795044, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 1.8514285714285714, + "grad_norm": 0.35378390550613403, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 810 + }, + { + "epoch": 1.8742857142857143, + "grad_norm": 0.3625478148460388, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 820 + }, + { + "epoch": 1.8971428571428572, + "grad_norm": 0.36153221130371094, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 830 + }, + { + "epoch": 1.92, + "grad_norm": 0.3612948954105377, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 840 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.399213045835495, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 850 + }, + { + "epoch": 1.9657142857142857, + "grad_norm": 0.40026402473449707, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 860 + }, + { + "epoch": 1.9885714285714284, + "grad_norm": 0.38114118576049805, + "learning_rate": 0.0002, + "loss": 1.76, + "step": 870 + }, + { + "epoch": 2.0, + "eval_loss": 1.8279441595077515, + "eval_runtime": 111.1652, + "eval_samples_per_second": 4.561, + "eval_steps_per_second": 0.576, + "step": 875 + } + ], + "logging_steps": 10, + "max_steps": 3496, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.494327742464e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e8dec0ead2c8272aa4a091c380e4689f369a93e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b24d137580f566a690ee384c6a9afca9277476edf438c9aac06f28988d9d164 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/training_log.jsonl b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..275174b2c5baca4834779650122bfe11bcce1186 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/training_log.jsonl @@ -0,0 +1,13 @@ +{"epoch": 0.9988571428571429, "step": 437, "epoch_duration": 2864.7610309123993, "total_accumulated_duration": 2864.7610309123993, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0595, "grad_norm": 0.8717339634895325, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5797, "grad_norm": 0.5135980248451233, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1883, "grad_norm": 0.5807254910469055, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1197, "grad_norm": 0.5022327303886414, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.9424, "grad_norm": 0.39211124181747437, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9487, "grad_norm": 0.4291403591632843, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9404, "grad_norm": 0.43344980478286743, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9033, "grad_norm": 0.45926913619041443, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9333, "grad_norm": 0.46012991666793823, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8955, "grad_norm": 0.5238164067268372, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9122, "grad_norm": 0.39393168687820435, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8442, "grad_norm": 0.3722357749938965, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7979, "grad_norm": 0.4925648272037506, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8865, "grad_norm": 0.3977905213832855, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.834, "grad_norm": 0.34175440669059753, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.8256, "grad_norm": 0.4174216389656067, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7573, "grad_norm": 0.3641279339790344, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.31452980637550354, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8204, "grad_norm": 0.30585241317749023, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.8247, "grad_norm": 0.35243943333625793, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8546, "grad_norm": 0.37146371603012085, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.865, "grad_norm": 0.3326093852519989, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8701, "grad_norm": 0.3610652983188629, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8517, "grad_norm": 0.37059760093688965, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8752, "grad_norm": 0.348164826631546, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7926, "grad_norm": 0.32422614097595215, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.8074, "grad_norm": 0.3642662465572357, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8236, "grad_norm": 0.2940135896205902, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8289, "grad_norm": 0.31184637546539307, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7761, "grad_norm": 0.31996026635169983, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8445, "grad_norm": 0.3466435372829437, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8114, "grad_norm": 0.29803967475891113, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7734, "grad_norm": 0.30836978554725647, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8042, "grad_norm": 0.3359459936618805, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7711, "grad_norm": 0.33156177401542664, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.7289, "grad_norm": 0.34951063990592957, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8496, "grad_norm": 0.3386661112308502, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8789, "grad_norm": 0.3092683255672455, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8021, "grad_norm": 0.3764317035675049, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7513, "grad_norm": 0.32476580142974854, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7613, "grad_norm": 0.33311036229133606, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8133, "grad_norm": 0.29290494322776794, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7675, "grad_norm": 0.30328553915023804, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}]} +{"epoch": 0.9988571428571429, "step": 437, "epoch_duration": 680.2093601226807, "total_accumulated_duration": 680.2093601226807, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0737, "grad_norm": 0.676007866859436, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5931, "grad_norm": 0.5250661969184875, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.2149, "grad_norm": 0.5478432774543762, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1497, "grad_norm": 0.4776158034801483, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.9594, "grad_norm": 0.42377957701683044, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9602, "grad_norm": 0.4962829649448395, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9458, "grad_norm": 0.4248361587524414, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9088, "grad_norm": 0.4115945100784302, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9341, "grad_norm": 0.37363702058792114, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8998, "grad_norm": 0.42065081000328064, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9176, "grad_norm": 0.37641531229019165, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8522, "grad_norm": 0.41112732887268066, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7993, "grad_norm": 0.3546443283557892, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8909, "grad_norm": 0.38930970430374146, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8342, "grad_norm": 0.35351401567459106, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.8289, "grad_norm": 0.4080873727798462, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7552, "grad_norm": 0.3885005712509155, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8296, "grad_norm": 0.3329996168613434, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8208, "grad_norm": 0.3085007965564728, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.8251, "grad_norm": 0.34996384382247925, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8534, "grad_norm": 0.35104790329933167, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8658, "grad_norm": 0.40699303150177, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.873, "grad_norm": 0.3361871838569641, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8498, "grad_norm": 0.3529999554157257, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8757, "grad_norm": 0.3695283532142639, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7928, "grad_norm": 0.32378143072128296, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.8093, "grad_norm": 0.38105419278144836, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8238, "grad_norm": 0.29338300228118896, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8324, "grad_norm": 0.3283067047595978, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7762, "grad_norm": 0.31334781646728516, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8455, "grad_norm": 0.32596927881240845, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8107, "grad_norm": 0.3007946312427521, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7752, "grad_norm": 0.31740236282348633, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8079, "grad_norm": 0.357985258102417, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7742, "grad_norm": 0.36722445487976074, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.7309, "grad_norm": 0.3409443497657776, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8493, "grad_norm": 0.35894039273262024, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8798, "grad_norm": 0.3310607969760895, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8057, "grad_norm": 0.3114466071128845, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7537, "grad_norm": 0.30708882212638855, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7645, "grad_norm": 0.33819758892059326, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8153, "grad_norm": 0.30491453409194946, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7678, "grad_norm": 0.3099863529205322, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}]} +{"epoch": 0.9988571428571429, "step": 437, "epoch_duration": 1692.0529749393463, "total_accumulated_duration": 1692.0529749393463, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0635, "grad_norm": 0.751778781414032, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5861, "grad_norm": 0.5179391503334045, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.204, "grad_norm": 0.588005006313324, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1343, "grad_norm": 0.4717636704444885, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.9485, "grad_norm": 0.42742547392845154, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9543, "grad_norm": 0.46707019209861755, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9416, "grad_norm": 0.4594128429889679, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9003, "grad_norm": 0.4940229654312134, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9331, "grad_norm": 0.3711043894290924, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8954, "grad_norm": 0.48097744584083557, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9152, "grad_norm": 0.36268872022628784, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8476, "grad_norm": 0.40977758169174194, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.8017, "grad_norm": 0.44437164068222046, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8891, "grad_norm": 0.4007284343242645, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8344, "grad_norm": 0.35068053007125854, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.8242, "grad_norm": 0.3963766098022461, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7578, "grad_norm": 0.34578168392181396, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.3284793794155121, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8207, "grad_norm": 0.33896616101264954, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.8213, "grad_norm": 0.3573452830314636, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8525, "grad_norm": 0.3636205792427063, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8663, "grad_norm": 0.3481282889842987, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8691, "grad_norm": 0.3532394766807556, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8508, "grad_norm": 0.34349769353866577, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8735, "grad_norm": 0.34332799911499023, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7917, "grad_norm": 0.3243716359138489, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.8104, "grad_norm": 0.36064547300338745, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8232, "grad_norm": 0.28934213519096375, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.833, "grad_norm": 0.30864956974983215, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.775, "grad_norm": 0.2911549210548401, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8431, "grad_norm": 0.3338742256164551, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8111, "grad_norm": 0.30065304040908813, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7735, "grad_norm": 0.31962037086486816, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8013, "grad_norm": 0.3339603543281555, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7657, "grad_norm": 0.36495035886764526, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.7268, "grad_norm": 1.9275314807891846, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8487, "grad_norm": 0.3443487882614136, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8788, "grad_norm": 0.32266315817832947, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8039, "grad_norm": 0.366872102022171, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7528, "grad_norm": 0.31905823945999146, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7587, "grad_norm": 0.3308110535144806, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8102, "grad_norm": 0.2978809177875519, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.768, "grad_norm": 0.3174574673175812, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}]} +{"epoch": 0.9988571428571429, "step": 437, "epoch_duration": 2183.624102115631, "total_accumulated_duration": 2183.624102115631, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0688, "grad_norm": 0.7712277770042419, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5869, "grad_norm": 0.526583194732666, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1872, "grad_norm": 0.6245124936103821, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1127, "grad_norm": 0.5284358859062195, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.9428, "grad_norm": 0.40955156087875366, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9539, "grad_norm": 0.43062952160835266, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9474, "grad_norm": 0.43123340606689453, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9062, "grad_norm": 0.44926217198371887, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9365, "grad_norm": 0.41470426321029663, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.895, "grad_norm": 0.4223814010620117, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9131, "grad_norm": 0.3610304296016693, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8466, "grad_norm": 0.4295799434185028, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.804, "grad_norm": 0.45776671171188354, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4016251564025879, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8378, "grad_norm": 0.48137062788009644, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.8227, "grad_norm": 0.49422022700309753, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7536, "grad_norm": 0.34034672379493713, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8266, "grad_norm": 0.3166326582431793, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8211, "grad_norm": 0.3084140717983246, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.821, "grad_norm": 0.3594462275505066, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.854, "grad_norm": 0.3633425831794739, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8614, "grad_norm": 0.356911838054657, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8722, "grad_norm": 0.47076764702796936, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8526, "grad_norm": 0.3551470935344696, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8764, "grad_norm": 0.3632679581642151, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7932, "grad_norm": 0.32329922914505005, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.8088, "grad_norm": 0.3659713566303253, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8236, "grad_norm": 0.30823490023612976, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8291, "grad_norm": 0.31695812940597534, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7757, "grad_norm": 0.30165690183639526, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8446, "grad_norm": 0.3283828794956207, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8093, "grad_norm": 0.3035045266151428, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7727, "grad_norm": 0.3053770661354065, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8035, "grad_norm": 0.3487062156200409, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7687, "grad_norm": 0.34441572427749634, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.7269, "grad_norm": 0.35523343086242676, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8515, "grad_norm": 0.3516801595687866, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8815, "grad_norm": 0.31675922870635986, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8073, "grad_norm": 0.3547166883945465, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7517, "grad_norm": 0.3162825405597687, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.765, "grad_norm": 0.3345049023628235, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8113, "grad_norm": 0.30412253737449646, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7697, "grad_norm": 0.3209483027458191, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}]} +{"epoch": 0.9988571428571429, "step": 437, "epoch_duration": 1455.278377532959, "total_accumulated_duration": 1455.278377532959, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.066, "grad_norm": 0.6273946762084961, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5855, "grad_norm": 0.5300710201263428, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1887, "grad_norm": 0.6162196397781372, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1164, "grad_norm": 0.5143047571182251, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.943, "grad_norm": 0.4000673294067383, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9531, "grad_norm": 0.444892555475235, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9435, "grad_norm": 0.4871707558631897, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9072, "grad_norm": 0.451060026884079, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9312, "grad_norm": 0.3939569592475891, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8982, "grad_norm": 0.5033721923828125, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9148, "grad_norm": 0.3636534512042999, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8462, "grad_norm": 0.4391206204891205, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7998, "grad_norm": 0.5243169665336609, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4055655598640442, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8348, "grad_norm": 0.39735132455825806, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.824, "grad_norm": 0.4696349501609802, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7566, "grad_norm": 0.3987901508808136, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.32404327392578125, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8178, "grad_norm": 0.3692261576652527, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.823, "grad_norm": 0.37267744541168213, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8564, "grad_norm": 0.3559934198856354, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8651, "grad_norm": 0.3374815285205841, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8683, "grad_norm": 0.34598177671432495, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8554, "grad_norm": 0.35629919171333313, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8751, "grad_norm": 0.3586862087249756, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7942, "grad_norm": 0.3198927342891693, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.81, "grad_norm": 0.37690025568008423, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8258, "grad_norm": 0.2855667471885681, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8288, "grad_norm": 0.3242695927619934, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7738, "grad_norm": 0.2960120141506195, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8443, "grad_norm": 0.3596384823322296, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8132, "grad_norm": 0.3001834750175476, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7751, "grad_norm": 0.31361159682273865, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8021, "grad_norm": 0.34093308448791504, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7674, "grad_norm": 0.3383876085281372, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.729, "grad_norm": 0.35100996494293213, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8517, "grad_norm": 0.344976007938385, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8779, "grad_norm": 0.3119729459285736, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8043, "grad_norm": 0.349221795797348, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7529, "grad_norm": 0.3124293386936188, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7611, "grad_norm": 0.35504350066185, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8115, "grad_norm": 0.310310959815979, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7666, "grad_norm": 0.30432847142219543, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}]} +{"epoch": 2.0, "step": 875, "epoch_duration": 1423.373628616333, "total_accumulated_duration": 2878.652006149292, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-437", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.066, "grad_norm": 0.6273946762084961, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5855, "grad_norm": 0.5300710201263428, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1887, "grad_norm": 0.6162196397781372, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1164, "grad_norm": 0.5143047571182251, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.943, "grad_norm": 0.4000673294067383, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9531, "grad_norm": 0.444892555475235, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9435, "grad_norm": 0.4871707558631897, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9072, "grad_norm": 0.451060026884079, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9312, "grad_norm": 0.3939569592475891, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8982, "grad_norm": 0.5033721923828125, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9148, "grad_norm": 0.3636534512042999, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8462, "grad_norm": 0.4391206204891205, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7998, "grad_norm": 0.5243169665336609, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4055655598640442, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8348, "grad_norm": 0.39735132455825806, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.824, "grad_norm": 0.4696349501609802, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7566, "grad_norm": 0.3987901508808136, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.32404327392578125, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8178, "grad_norm": 0.3692261576652527, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.823, "grad_norm": 0.37267744541168213, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8564, "grad_norm": 0.3559934198856354, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8651, "grad_norm": 0.3374815285205841, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8683, "grad_norm": 0.34598177671432495, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8554, "grad_norm": 0.35629919171333313, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8751, "grad_norm": 0.3586862087249756, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7942, "grad_norm": 0.3198927342891693, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.81, "grad_norm": 0.37690025568008423, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8258, "grad_norm": 0.2855667471885681, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8288, "grad_norm": 0.3242695927619934, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7738, "grad_norm": 0.2960120141506195, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8443, "grad_norm": 0.3596384823322296, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8132, "grad_norm": 0.3001834750175476, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7751, "grad_norm": 0.31361159682273865, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8021, "grad_norm": 0.34093308448791504, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7674, "grad_norm": 0.3383876085281372, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.729, "grad_norm": 0.35100996494293213, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8517, "grad_norm": 0.344976007938385, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8779, "grad_norm": 0.3119729459285736, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8043, "grad_norm": 0.349221795797348, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7529, "grad_norm": 0.3124293386936188, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7611, "grad_norm": 0.35504350066185, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8115, "grad_norm": 0.310310959815979, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7666, "grad_norm": 0.30432847142219543, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}, {"eval_loss": 1.8310153484344482, "eval_runtime": 111.7814, "eval_samples_per_second": 4.536, "eval_steps_per_second": 0.573, "epoch": 0.9988571428571429, "step": 437}, {"loss": 1.7435, "grad_norm": 0.3121616840362549, "learning_rate": 0.0002, "epoch": 1.0057142857142858, "step": 440}, {"loss": 1.7404, "grad_norm": 0.3365118205547333, "learning_rate": 0.0002, "epoch": 1.0285714285714285, "step": 450}, {"loss": 1.7901, "grad_norm": 0.3626686930656433, "learning_rate": 0.0002, "epoch": 1.0514285714285714, "step": 460}, {"loss": 1.721, "grad_norm": 0.30539533495903015, "learning_rate": 0.0002, "epoch": 1.0742857142857143, "step": 470}, {"loss": 1.7354, "grad_norm": 0.3159816861152649, "learning_rate": 0.0002, "epoch": 1.0971428571428572, "step": 480}, {"loss": 1.7471, "grad_norm": 0.3695855736732483, "learning_rate": 0.0002, "epoch": 1.12, "step": 490}, {"loss": 1.7626, "grad_norm": 0.3609161674976349, "learning_rate": 0.0002, "epoch": 1.1428571428571428, "step": 500}, {"loss": 1.8723, "grad_norm": 0.3683869242668152, "learning_rate": 0.0002, "epoch": 1.1657142857142857, "step": 510}, {"loss": 1.7102, "grad_norm": 0.3862539529800415, "learning_rate": 0.0002, "epoch": 1.1885714285714286, "step": 520}, {"loss": 1.7589, "grad_norm": 0.4244740307331085, "learning_rate": 0.0002, "epoch": 1.2114285714285715, "step": 530}, {"loss": 1.717, "grad_norm": 0.373703271150589, "learning_rate": 0.0002, "epoch": 1.2342857142857142, "step": 540}, {"loss": 1.795, "grad_norm": 0.35715773701667786, "learning_rate": 0.0002, "epoch": 1.2571428571428571, "step": 550}, {"loss": 1.7578, "grad_norm": 0.3555964231491089, "learning_rate": 0.0002, "epoch": 1.28, "step": 560}, {"loss": 1.7228, "grad_norm": 0.35080263018608093, "learning_rate": 0.0002, "epoch": 1.302857142857143, "step": 570}, {"loss": 1.6808, "grad_norm": 0.3589482307434082, "learning_rate": 0.0002, "epoch": 1.3257142857142856, "step": 580}, {"loss": 1.7369, "grad_norm": 0.3711223900318146, "learning_rate": 0.0002, "epoch": 1.3485714285714285, "step": 590}, {"loss": 1.7417, "grad_norm": 0.313614159822464, "learning_rate": 0.0002, "epoch": 1.3714285714285714, "step": 600}, {"loss": 1.7191, "grad_norm": 0.3842357397079468, "learning_rate": 0.0002, "epoch": 1.3942857142857144, "step": 610}, {"loss": 1.6737, "grad_norm": 0.36126819252967834, "learning_rate": 0.0002, "epoch": 1.4171428571428573, "step": 620}, {"loss": 1.7073, "grad_norm": 0.35922661423683167, "learning_rate": 0.0002, "epoch": 1.44, "step": 630}, {"loss": 1.6708, "grad_norm": 0.3922875225543976, "learning_rate": 0.0002, "epoch": 1.4628571428571429, "step": 640}, {"loss": 1.7544, "grad_norm": 0.365546852350235, "learning_rate": 0.0002, "epoch": 1.4857142857142858, "step": 650}, {"loss": 1.674, "grad_norm": 0.36107590794563293, "learning_rate": 0.0002, "epoch": 1.5085714285714285, "step": 660}, {"loss": 1.6518, "grad_norm": 0.3307042121887207, "learning_rate": 0.0002, "epoch": 1.5314285714285716, "step": 670}, {"loss": 1.7306, "grad_norm": 0.3492133915424347, "learning_rate": 0.0002, "epoch": 1.5542857142857143, "step": 680}, {"loss": 1.812, "grad_norm": 0.38608574867248535, "learning_rate": 0.0002, "epoch": 1.5771428571428572, "step": 690}, {"loss": 1.7042, "grad_norm": 0.3489173650741577, "learning_rate": 0.0002, "epoch": 1.6, "step": 700}, {"loss": 1.7306, "grad_norm": 0.36614152789115906, "learning_rate": 0.0002, "epoch": 1.6228571428571428, "step": 710}, {"loss": 1.7281, "grad_norm": 0.34340205788612366, "learning_rate": 0.0002, "epoch": 1.6457142857142857, "step": 720}, {"loss": 1.7607, "grad_norm": 0.34590771794319153, "learning_rate": 0.0002, "epoch": 1.6685714285714286, "step": 730}, {"loss": 1.7108, "grad_norm": 0.3759954273700714, "learning_rate": 0.0002, "epoch": 1.6914285714285713, "step": 740}, {"loss": 1.6903, "grad_norm": 0.3753475546836853, "learning_rate": 0.0002, "epoch": 1.7142857142857144, "step": 750}, {"loss": 1.7054, "grad_norm": 0.38416001200675964, "learning_rate": 0.0002, "epoch": 1.737142857142857, "step": 760}, {"loss": 1.7125, "grad_norm": 0.36223554611206055, "learning_rate": 0.0002, "epoch": 1.76, "step": 770}, {"loss": 1.6878, "grad_norm": 0.329556941986084, "learning_rate": 0.0002, "epoch": 1.782857142857143, "step": 780}, {"loss": 1.803, "grad_norm": 0.34008052945137024, "learning_rate": 0.0002, "epoch": 1.8057142857142856, "step": 790}, {"loss": 1.6545, "grad_norm": 0.40297919511795044, "learning_rate": 0.0002, "epoch": 1.8285714285714287, "step": 800}, {"loss": 1.7518, "grad_norm": 0.35378390550613403, "learning_rate": 0.0002, "epoch": 1.8514285714285714, "step": 810}, {"loss": 1.7063, "grad_norm": 0.3625478148460388, "learning_rate": 0.0002, "epoch": 1.8742857142857143, "step": 820}, {"loss": 1.6339, "grad_norm": 0.36153221130371094, "learning_rate": 0.0002, "epoch": 1.8971428571428572, "step": 830}, {"loss": 1.7434, "grad_norm": 0.3612948954105377, "learning_rate": 0.0002, "epoch": 1.92, "step": 840}, {"loss": 1.7242, "grad_norm": 0.399213045835495, "learning_rate": 0.0002, "epoch": 1.9428571428571428, "step": 850}, {"loss": 1.6909, "grad_norm": 0.40026402473449707, "learning_rate": 0.0002, "epoch": 1.9657142857142857, "step": 860}, {"loss": 1.76, "grad_norm": 0.38114118576049805, "learning_rate": 0.0002, "epoch": 1.9885714285714284, "step": 870}]} +{"epoch": 2.998857142857143, "step": 1312, "epoch_duration": 1505.2112596035004, "total_accumulated_duration": 4383.863265752792, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.066, "grad_norm": 0.6273946762084961, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5855, "grad_norm": 0.5300710201263428, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1887, "grad_norm": 0.6162196397781372, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1164, "grad_norm": 0.5143047571182251, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.943, "grad_norm": 0.4000673294067383, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9531, "grad_norm": 0.444892555475235, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9435, "grad_norm": 0.4871707558631897, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9072, "grad_norm": 0.451060026884079, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9312, "grad_norm": 0.3939569592475891, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8982, "grad_norm": 0.5033721923828125, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9148, "grad_norm": 0.3636534512042999, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8462, "grad_norm": 0.4391206204891205, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7998, "grad_norm": 0.5243169665336609, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4055655598640442, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8348, "grad_norm": 0.39735132455825806, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.824, "grad_norm": 0.4696349501609802, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7566, "grad_norm": 0.3987901508808136, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.32404327392578125, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8178, "grad_norm": 0.3692261576652527, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.823, "grad_norm": 0.37267744541168213, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8564, "grad_norm": 0.3559934198856354, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8651, "grad_norm": 0.3374815285205841, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8683, "grad_norm": 0.34598177671432495, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8554, "grad_norm": 0.35629919171333313, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8751, "grad_norm": 0.3586862087249756, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7942, "grad_norm": 0.3198927342891693, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.81, "grad_norm": 0.37690025568008423, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8258, "grad_norm": 0.2855667471885681, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8288, "grad_norm": 0.3242695927619934, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7738, "grad_norm": 0.2960120141506195, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8443, "grad_norm": 0.3596384823322296, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8132, "grad_norm": 0.3001834750175476, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7751, "grad_norm": 0.31361159682273865, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8021, "grad_norm": 0.34093308448791504, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7674, "grad_norm": 0.3383876085281372, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.729, "grad_norm": 0.35100996494293213, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8517, "grad_norm": 0.344976007938385, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8779, "grad_norm": 0.3119729459285736, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8043, "grad_norm": 0.349221795797348, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7529, "grad_norm": 0.3124293386936188, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7611, "grad_norm": 0.35504350066185, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8115, "grad_norm": 0.310310959815979, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7666, "grad_norm": 0.30432847142219543, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}, {"eval_loss": 1.8310153484344482, "eval_runtime": 111.7814, "eval_samples_per_second": 4.536, "eval_steps_per_second": 0.573, "epoch": 0.9988571428571429, "step": 437}, {"loss": 1.7435, "grad_norm": 0.3121616840362549, "learning_rate": 0.0002, "epoch": 1.0057142857142858, "step": 440}, {"loss": 1.7404, "grad_norm": 0.3365118205547333, "learning_rate": 0.0002, "epoch": 1.0285714285714285, "step": 450}, {"loss": 1.7901, "grad_norm": 0.3626686930656433, "learning_rate": 0.0002, "epoch": 1.0514285714285714, "step": 460}, {"loss": 1.721, "grad_norm": 0.30539533495903015, "learning_rate": 0.0002, "epoch": 1.0742857142857143, "step": 470}, {"loss": 1.7354, "grad_norm": 0.3159816861152649, "learning_rate": 0.0002, "epoch": 1.0971428571428572, "step": 480}, {"loss": 1.7471, "grad_norm": 0.3695855736732483, "learning_rate": 0.0002, "epoch": 1.12, "step": 490}, {"loss": 1.7626, "grad_norm": 0.3609161674976349, "learning_rate": 0.0002, "epoch": 1.1428571428571428, "step": 500}, {"loss": 1.8723, "grad_norm": 0.3683869242668152, "learning_rate": 0.0002, "epoch": 1.1657142857142857, "step": 510}, {"loss": 1.7102, "grad_norm": 0.3862539529800415, "learning_rate": 0.0002, "epoch": 1.1885714285714286, "step": 520}, {"loss": 1.7589, "grad_norm": 0.4244740307331085, "learning_rate": 0.0002, "epoch": 1.2114285714285715, "step": 530}, {"loss": 1.717, "grad_norm": 0.373703271150589, "learning_rate": 0.0002, "epoch": 1.2342857142857142, "step": 540}, {"loss": 1.795, "grad_norm": 0.35715773701667786, "learning_rate": 0.0002, "epoch": 1.2571428571428571, "step": 550}, {"loss": 1.7578, "grad_norm": 0.3555964231491089, "learning_rate": 0.0002, "epoch": 1.28, "step": 560}, {"loss": 1.7228, "grad_norm": 0.35080263018608093, "learning_rate": 0.0002, "epoch": 1.302857142857143, "step": 570}, {"loss": 1.6808, "grad_norm": 0.3589482307434082, "learning_rate": 0.0002, "epoch": 1.3257142857142856, "step": 580}, {"loss": 1.7369, "grad_norm": 0.3711223900318146, "learning_rate": 0.0002, "epoch": 1.3485714285714285, "step": 590}, {"loss": 1.7417, "grad_norm": 0.313614159822464, "learning_rate": 0.0002, "epoch": 1.3714285714285714, "step": 600}, {"loss": 1.7191, "grad_norm": 0.3842357397079468, "learning_rate": 0.0002, "epoch": 1.3942857142857144, "step": 610}, {"loss": 1.6737, "grad_norm": 0.36126819252967834, "learning_rate": 0.0002, "epoch": 1.4171428571428573, "step": 620}, {"loss": 1.7073, "grad_norm": 0.35922661423683167, "learning_rate": 0.0002, "epoch": 1.44, "step": 630}, {"loss": 1.6708, "grad_norm": 0.3922875225543976, "learning_rate": 0.0002, "epoch": 1.4628571428571429, "step": 640}, {"loss": 1.7544, "grad_norm": 0.365546852350235, "learning_rate": 0.0002, "epoch": 1.4857142857142858, "step": 650}, {"loss": 1.674, "grad_norm": 0.36107590794563293, "learning_rate": 0.0002, "epoch": 1.5085714285714285, "step": 660}, {"loss": 1.6518, "grad_norm": 0.3307042121887207, "learning_rate": 0.0002, "epoch": 1.5314285714285716, "step": 670}, {"loss": 1.7306, "grad_norm": 0.3492133915424347, "learning_rate": 0.0002, "epoch": 1.5542857142857143, "step": 680}, {"loss": 1.812, "grad_norm": 0.38608574867248535, "learning_rate": 0.0002, "epoch": 1.5771428571428572, "step": 690}, {"loss": 1.7042, "grad_norm": 0.3489173650741577, "learning_rate": 0.0002, "epoch": 1.6, "step": 700}, {"loss": 1.7306, "grad_norm": 0.36614152789115906, "learning_rate": 0.0002, "epoch": 1.6228571428571428, "step": 710}, {"loss": 1.7281, "grad_norm": 0.34340205788612366, "learning_rate": 0.0002, "epoch": 1.6457142857142857, "step": 720}, {"loss": 1.7607, "grad_norm": 0.34590771794319153, "learning_rate": 0.0002, "epoch": 1.6685714285714286, "step": 730}, {"loss": 1.7108, "grad_norm": 0.3759954273700714, "learning_rate": 0.0002, "epoch": 1.6914285714285713, "step": 740}, {"loss": 1.6903, "grad_norm": 0.3753475546836853, "learning_rate": 0.0002, "epoch": 1.7142857142857144, "step": 750}, {"loss": 1.7054, "grad_norm": 0.38416001200675964, "learning_rate": 0.0002, "epoch": 1.737142857142857, "step": 760}, {"loss": 1.7125, "grad_norm": 0.36223554611206055, "learning_rate": 0.0002, "epoch": 1.76, "step": 770}, {"loss": 1.6878, "grad_norm": 0.329556941986084, "learning_rate": 0.0002, "epoch": 1.782857142857143, "step": 780}, {"loss": 1.803, "grad_norm": 0.34008052945137024, "learning_rate": 0.0002, "epoch": 1.8057142857142856, "step": 790}, {"loss": 1.6545, "grad_norm": 0.40297919511795044, "learning_rate": 0.0002, "epoch": 1.8285714285714287, "step": 800}, {"loss": 1.7518, "grad_norm": 0.35378390550613403, "learning_rate": 0.0002, "epoch": 1.8514285714285714, "step": 810}, {"loss": 1.7063, "grad_norm": 0.3625478148460388, "learning_rate": 0.0002, "epoch": 1.8742857142857143, "step": 820}, {"loss": 1.6339, "grad_norm": 0.36153221130371094, "learning_rate": 0.0002, "epoch": 1.8971428571428572, "step": 830}, {"loss": 1.7434, "grad_norm": 0.3612948954105377, "learning_rate": 0.0002, "epoch": 1.92, "step": 840}, {"loss": 1.7242, "grad_norm": 0.399213045835495, "learning_rate": 0.0002, "epoch": 1.9428571428571428, "step": 850}, {"loss": 1.6909, "grad_norm": 0.40026402473449707, "learning_rate": 0.0002, "epoch": 1.9657142857142857, "step": 860}, {"loss": 1.76, "grad_norm": 0.38114118576049805, "learning_rate": 0.0002, "epoch": 1.9885714285714284, "step": 870}, {"eval_loss": 1.8279441595077515, "eval_runtime": 111.1652, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 875}, {"loss": 1.7215, "grad_norm": 0.33838793635368347, "learning_rate": 0.0002, "epoch": 2.0114285714285716, "step": 880}, {"loss": 1.6105, "grad_norm": 0.4219334125518799, "learning_rate": 0.0002, "epoch": 2.0342857142857143, "step": 890}, {"loss": 1.6144, "grad_norm": 0.43962377309799194, "learning_rate": 0.0002, "epoch": 2.057142857142857, "step": 900}, {"loss": 1.5821, "grad_norm": 0.41956576704978943, "learning_rate": 0.0002, "epoch": 2.08, "step": 910}, {"loss": 1.5812, "grad_norm": 0.4439629912376404, "learning_rate": 0.0002, "epoch": 2.1028571428571428, "step": 920}, {"loss": 1.5923, "grad_norm": 0.43405696749687195, "learning_rate": 0.0002, "epoch": 2.125714285714286, "step": 930}, {"loss": 1.5968, "grad_norm": 0.4321737587451935, "learning_rate": 0.0002, "epoch": 2.1485714285714286, "step": 940}, {"loss": 1.5683, "grad_norm": 0.4689100682735443, "learning_rate": 0.0002, "epoch": 2.1714285714285713, "step": 950}, {"loss": 1.6442, "grad_norm": 0.47024697065353394, "learning_rate": 0.0002, "epoch": 2.1942857142857144, "step": 960}, {"loss": 1.4703, "grad_norm": 0.4535103440284729, "learning_rate": 0.0002, "epoch": 2.217142857142857, "step": 970}, {"loss": 1.6378, "grad_norm": 0.45990121364593506, "learning_rate": 0.0002, "epoch": 2.24, "step": 980}, {"loss": 1.627, "grad_norm": 0.48427215218544006, "learning_rate": 0.0002, "epoch": 2.262857142857143, "step": 990}, {"loss": 1.6316, "grad_norm": 0.43076643347740173, "learning_rate": 0.0002, "epoch": 2.2857142857142856, "step": 1000}, {"loss": 1.5938, "grad_norm": 0.4854483902454376, "learning_rate": 0.0002, "epoch": 2.3085714285714287, "step": 1010}, {"loss": 1.6223, "grad_norm": 0.46086496114730835, "learning_rate": 0.0002, "epoch": 2.3314285714285714, "step": 1020}, {"loss": 1.6392, "grad_norm": 0.4714847505092621, "learning_rate": 0.0002, "epoch": 2.354285714285714, "step": 1030}, {"loss": 1.5785, "grad_norm": 0.4423409402370453, "learning_rate": 0.0002, "epoch": 2.3771428571428572, "step": 1040}, {"loss": 1.5821, "grad_norm": 0.46261295676231384, "learning_rate": 0.0002, "epoch": 2.4, "step": 1050}, {"loss": 1.5503, "grad_norm": 0.4914337396621704, "learning_rate": 0.0002, "epoch": 2.422857142857143, "step": 1060}, {"loss": 1.6459, "grad_norm": 0.45144036412239075, "learning_rate": 0.0002, "epoch": 2.4457142857142857, "step": 1070}, {"loss": 1.6416, "grad_norm": 0.4510825276374817, "learning_rate": 0.0002, "epoch": 2.4685714285714284, "step": 1080}, {"loss": 1.5808, "grad_norm": 0.48552489280700684, "learning_rate": 0.0002, "epoch": 2.4914285714285715, "step": 1090}, {"loss": 1.6659, "grad_norm": 0.4768163859844208, "learning_rate": 0.0002, "epoch": 2.5142857142857142, "step": 1100}, {"loss": 1.6251, "grad_norm": 0.5192609429359436, "learning_rate": 0.0002, "epoch": 2.5371428571428574, "step": 1110}, {"loss": 1.5581, "grad_norm": 0.49308598041534424, "learning_rate": 0.0002, "epoch": 2.56, "step": 1120}, {"loss": 1.6767, "grad_norm": 0.5068584084510803, "learning_rate": 0.0002, "epoch": 2.5828571428571427, "step": 1130}, {"loss": 1.5788, "grad_norm": 0.4822661280632019, "learning_rate": 0.0002, "epoch": 2.605714285714286, "step": 1140}, {"loss": 1.6891, "grad_norm": 0.5028144717216492, "learning_rate": 0.0002, "epoch": 2.6285714285714286, "step": 1150}, {"loss": 1.6782, "grad_norm": 0.48315200209617615, "learning_rate": 0.0002, "epoch": 2.6514285714285712, "step": 1160}, {"loss": 1.5508, "grad_norm": 0.551934540271759, "learning_rate": 0.0002, "epoch": 2.6742857142857144, "step": 1170}, {"loss": 1.5448, "grad_norm": 0.49223729968070984, "learning_rate": 0.0002, "epoch": 2.697142857142857, "step": 1180}, {"loss": 1.6382, "grad_norm": 0.514847457408905, "learning_rate": 0.0002, "epoch": 2.7199999999999998, "step": 1190}, {"loss": 1.5054, "grad_norm": 0.4830605387687683, "learning_rate": 0.0002, "epoch": 2.742857142857143, "step": 1200}, {"loss": 1.6285, "grad_norm": 0.4584822952747345, "learning_rate": 0.0002, "epoch": 2.7657142857142856, "step": 1210}, {"loss": 1.6043, "grad_norm": 0.4688762426376343, "learning_rate": 0.0002, "epoch": 2.7885714285714287, "step": 1220}, {"loss": 1.5555, "grad_norm": 0.4488156735897064, "learning_rate": 0.0002, "epoch": 2.8114285714285714, "step": 1230}, {"loss": 1.5523, "grad_norm": 0.4700278639793396, "learning_rate": 0.0002, "epoch": 2.8342857142857145, "step": 1240}, {"loss": 1.642, "grad_norm": 0.5282207131385803, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 1250}, {"loss": 1.5532, "grad_norm": 0.4874219000339508, "learning_rate": 0.0002, "epoch": 2.88, "step": 1260}, {"loss": 1.6051, "grad_norm": 0.49468332529067993, "learning_rate": 0.0002, "epoch": 2.902857142857143, "step": 1270}, {"loss": 1.6006, "grad_norm": 0.49770233035087585, "learning_rate": 0.0002, "epoch": 2.9257142857142857, "step": 1280}, {"loss": 1.606, "grad_norm": 0.4433252811431885, "learning_rate": 0.0002, "epoch": 2.9485714285714284, "step": 1290}, {"loss": 1.608, "grad_norm": 0.46836379170417786, "learning_rate": 0.0002, "epoch": 2.9714285714285715, "step": 1300}, {"loss": 1.5555, "grad_norm": 0.5001904368400574, "learning_rate": 0.0002, "epoch": 2.994285714285714, "step": 1310}]} +{"epoch": 4.0, "step": 1750, "epoch_duration": 1378.1591472625732, "total_accumulated_duration": 5762.022413015366, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.066, "grad_norm": 0.6273946762084961, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5855, "grad_norm": 0.5300710201263428, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1887, "grad_norm": 0.6162196397781372, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1164, "grad_norm": 0.5143047571182251, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.943, "grad_norm": 0.4000673294067383, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9531, "grad_norm": 0.444892555475235, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9435, "grad_norm": 0.4871707558631897, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9072, "grad_norm": 0.451060026884079, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9312, "grad_norm": 0.3939569592475891, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8982, "grad_norm": 0.5033721923828125, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9148, "grad_norm": 0.3636534512042999, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8462, "grad_norm": 0.4391206204891205, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7998, "grad_norm": 0.5243169665336609, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4055655598640442, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8348, "grad_norm": 0.39735132455825806, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.824, "grad_norm": 0.4696349501609802, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7566, "grad_norm": 0.3987901508808136, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.32404327392578125, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8178, "grad_norm": 0.3692261576652527, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.823, "grad_norm": 0.37267744541168213, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8564, "grad_norm": 0.3559934198856354, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8651, "grad_norm": 0.3374815285205841, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8683, "grad_norm": 0.34598177671432495, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8554, "grad_norm": 0.35629919171333313, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8751, "grad_norm": 0.3586862087249756, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7942, "grad_norm": 0.3198927342891693, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.81, "grad_norm": 0.37690025568008423, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8258, "grad_norm": 0.2855667471885681, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8288, "grad_norm": 0.3242695927619934, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7738, "grad_norm": 0.2960120141506195, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8443, "grad_norm": 0.3596384823322296, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8132, "grad_norm": 0.3001834750175476, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7751, "grad_norm": 0.31361159682273865, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8021, "grad_norm": 0.34093308448791504, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7674, "grad_norm": 0.3383876085281372, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.729, "grad_norm": 0.35100996494293213, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8517, "grad_norm": 0.344976007938385, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8779, "grad_norm": 0.3119729459285736, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8043, "grad_norm": 0.349221795797348, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7529, "grad_norm": 0.3124293386936188, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7611, "grad_norm": 0.35504350066185, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8115, "grad_norm": 0.310310959815979, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7666, "grad_norm": 0.30432847142219543, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}, {"eval_loss": 1.8310153484344482, "eval_runtime": 111.7814, "eval_samples_per_second": 4.536, "eval_steps_per_second": 0.573, "epoch": 0.9988571428571429, "step": 437}, {"loss": 1.7435, "grad_norm": 0.3121616840362549, "learning_rate": 0.0002, "epoch": 1.0057142857142858, "step": 440}, {"loss": 1.7404, "grad_norm": 0.3365118205547333, "learning_rate": 0.0002, "epoch": 1.0285714285714285, "step": 450}, {"loss": 1.7901, "grad_norm": 0.3626686930656433, "learning_rate": 0.0002, "epoch": 1.0514285714285714, "step": 460}, {"loss": 1.721, "grad_norm": 0.30539533495903015, "learning_rate": 0.0002, "epoch": 1.0742857142857143, "step": 470}, {"loss": 1.7354, "grad_norm": 0.3159816861152649, "learning_rate": 0.0002, "epoch": 1.0971428571428572, "step": 480}, {"loss": 1.7471, "grad_norm": 0.3695855736732483, "learning_rate": 0.0002, "epoch": 1.12, "step": 490}, {"loss": 1.7626, "grad_norm": 0.3609161674976349, "learning_rate": 0.0002, "epoch": 1.1428571428571428, "step": 500}, {"loss": 1.8723, "grad_norm": 0.3683869242668152, "learning_rate": 0.0002, "epoch": 1.1657142857142857, "step": 510}, {"loss": 1.7102, "grad_norm": 0.3862539529800415, "learning_rate": 0.0002, "epoch": 1.1885714285714286, "step": 520}, {"loss": 1.7589, "grad_norm": 0.4244740307331085, "learning_rate": 0.0002, "epoch": 1.2114285714285715, "step": 530}, {"loss": 1.717, "grad_norm": 0.373703271150589, "learning_rate": 0.0002, "epoch": 1.2342857142857142, "step": 540}, {"loss": 1.795, "grad_norm": 0.35715773701667786, "learning_rate": 0.0002, "epoch": 1.2571428571428571, "step": 550}, {"loss": 1.7578, "grad_norm": 0.3555964231491089, "learning_rate": 0.0002, "epoch": 1.28, "step": 560}, {"loss": 1.7228, "grad_norm": 0.35080263018608093, "learning_rate": 0.0002, "epoch": 1.302857142857143, "step": 570}, {"loss": 1.6808, "grad_norm": 0.3589482307434082, "learning_rate": 0.0002, "epoch": 1.3257142857142856, "step": 580}, {"loss": 1.7369, "grad_norm": 0.3711223900318146, "learning_rate": 0.0002, "epoch": 1.3485714285714285, "step": 590}, {"loss": 1.7417, "grad_norm": 0.313614159822464, "learning_rate": 0.0002, "epoch": 1.3714285714285714, "step": 600}, {"loss": 1.7191, "grad_norm": 0.3842357397079468, "learning_rate": 0.0002, "epoch": 1.3942857142857144, "step": 610}, {"loss": 1.6737, "grad_norm": 0.36126819252967834, "learning_rate": 0.0002, "epoch": 1.4171428571428573, "step": 620}, {"loss": 1.7073, "grad_norm": 0.35922661423683167, "learning_rate": 0.0002, "epoch": 1.44, "step": 630}, {"loss": 1.6708, "grad_norm": 0.3922875225543976, "learning_rate": 0.0002, "epoch": 1.4628571428571429, "step": 640}, {"loss": 1.7544, "grad_norm": 0.365546852350235, "learning_rate": 0.0002, "epoch": 1.4857142857142858, "step": 650}, {"loss": 1.674, "grad_norm": 0.36107590794563293, "learning_rate": 0.0002, "epoch": 1.5085714285714285, "step": 660}, {"loss": 1.6518, "grad_norm": 0.3307042121887207, "learning_rate": 0.0002, "epoch": 1.5314285714285716, "step": 670}, {"loss": 1.7306, "grad_norm": 0.3492133915424347, "learning_rate": 0.0002, "epoch": 1.5542857142857143, "step": 680}, {"loss": 1.812, "grad_norm": 0.38608574867248535, "learning_rate": 0.0002, "epoch": 1.5771428571428572, "step": 690}, {"loss": 1.7042, "grad_norm": 0.3489173650741577, "learning_rate": 0.0002, "epoch": 1.6, "step": 700}, {"loss": 1.7306, "grad_norm": 0.36614152789115906, "learning_rate": 0.0002, "epoch": 1.6228571428571428, "step": 710}, {"loss": 1.7281, "grad_norm": 0.34340205788612366, "learning_rate": 0.0002, "epoch": 1.6457142857142857, "step": 720}, {"loss": 1.7607, "grad_norm": 0.34590771794319153, "learning_rate": 0.0002, "epoch": 1.6685714285714286, "step": 730}, {"loss": 1.7108, "grad_norm": 0.3759954273700714, "learning_rate": 0.0002, "epoch": 1.6914285714285713, "step": 740}, {"loss": 1.6903, "grad_norm": 0.3753475546836853, "learning_rate": 0.0002, "epoch": 1.7142857142857144, "step": 750}, {"loss": 1.7054, "grad_norm": 0.38416001200675964, "learning_rate": 0.0002, "epoch": 1.737142857142857, "step": 760}, {"loss": 1.7125, "grad_norm": 0.36223554611206055, "learning_rate": 0.0002, "epoch": 1.76, "step": 770}, {"loss": 1.6878, "grad_norm": 0.329556941986084, "learning_rate": 0.0002, "epoch": 1.782857142857143, "step": 780}, {"loss": 1.803, "grad_norm": 0.34008052945137024, "learning_rate": 0.0002, "epoch": 1.8057142857142856, "step": 790}, {"loss": 1.6545, "grad_norm": 0.40297919511795044, "learning_rate": 0.0002, "epoch": 1.8285714285714287, "step": 800}, {"loss": 1.7518, "grad_norm": 0.35378390550613403, "learning_rate": 0.0002, "epoch": 1.8514285714285714, "step": 810}, {"loss": 1.7063, "grad_norm": 0.3625478148460388, "learning_rate": 0.0002, "epoch": 1.8742857142857143, "step": 820}, {"loss": 1.6339, "grad_norm": 0.36153221130371094, "learning_rate": 0.0002, "epoch": 1.8971428571428572, "step": 830}, {"loss": 1.7434, "grad_norm": 0.3612948954105377, "learning_rate": 0.0002, "epoch": 1.92, "step": 840}, {"loss": 1.7242, "grad_norm": 0.399213045835495, "learning_rate": 0.0002, "epoch": 1.9428571428571428, "step": 850}, {"loss": 1.6909, "grad_norm": 0.40026402473449707, "learning_rate": 0.0002, "epoch": 1.9657142857142857, "step": 860}, {"loss": 1.76, "grad_norm": 0.38114118576049805, "learning_rate": 0.0002, "epoch": 1.9885714285714284, "step": 870}, {"eval_loss": 1.8279441595077515, "eval_runtime": 111.1652, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 875}, {"loss": 1.7215, "grad_norm": 0.33838793635368347, "learning_rate": 0.0002, "epoch": 2.0114285714285716, "step": 880}, {"loss": 1.6105, "grad_norm": 0.4219334125518799, "learning_rate": 0.0002, "epoch": 2.0342857142857143, "step": 890}, {"loss": 1.6144, "grad_norm": 0.43962377309799194, "learning_rate": 0.0002, "epoch": 2.057142857142857, "step": 900}, {"loss": 1.5821, "grad_norm": 0.41956576704978943, "learning_rate": 0.0002, "epoch": 2.08, "step": 910}, {"loss": 1.5812, "grad_norm": 0.4439629912376404, "learning_rate": 0.0002, "epoch": 2.1028571428571428, "step": 920}, {"loss": 1.5923, "grad_norm": 0.43405696749687195, "learning_rate": 0.0002, "epoch": 2.125714285714286, "step": 930}, {"loss": 1.5968, "grad_norm": 0.4321737587451935, "learning_rate": 0.0002, "epoch": 2.1485714285714286, "step": 940}, {"loss": 1.5683, "grad_norm": 0.4689100682735443, "learning_rate": 0.0002, "epoch": 2.1714285714285713, "step": 950}, {"loss": 1.6442, "grad_norm": 0.47024697065353394, "learning_rate": 0.0002, "epoch": 2.1942857142857144, "step": 960}, {"loss": 1.4703, "grad_norm": 0.4535103440284729, "learning_rate": 0.0002, "epoch": 2.217142857142857, "step": 970}, {"loss": 1.6378, "grad_norm": 0.45990121364593506, "learning_rate": 0.0002, "epoch": 2.24, "step": 980}, {"loss": 1.627, "grad_norm": 0.48427215218544006, "learning_rate": 0.0002, "epoch": 2.262857142857143, "step": 990}, {"loss": 1.6316, "grad_norm": 0.43076643347740173, "learning_rate": 0.0002, "epoch": 2.2857142857142856, "step": 1000}, {"loss": 1.5938, "grad_norm": 0.4854483902454376, "learning_rate": 0.0002, "epoch": 2.3085714285714287, "step": 1010}, {"loss": 1.6223, "grad_norm": 0.46086496114730835, "learning_rate": 0.0002, "epoch": 2.3314285714285714, "step": 1020}, {"loss": 1.6392, "grad_norm": 0.4714847505092621, "learning_rate": 0.0002, "epoch": 2.354285714285714, "step": 1030}, {"loss": 1.5785, "grad_norm": 0.4423409402370453, "learning_rate": 0.0002, "epoch": 2.3771428571428572, "step": 1040}, {"loss": 1.5821, "grad_norm": 0.46261295676231384, "learning_rate": 0.0002, "epoch": 2.4, "step": 1050}, {"loss": 1.5503, "grad_norm": 0.4914337396621704, "learning_rate": 0.0002, "epoch": 2.422857142857143, "step": 1060}, {"loss": 1.6459, "grad_norm": 0.45144036412239075, "learning_rate": 0.0002, "epoch": 2.4457142857142857, "step": 1070}, {"loss": 1.6416, "grad_norm": 0.4510825276374817, "learning_rate": 0.0002, "epoch": 2.4685714285714284, "step": 1080}, {"loss": 1.5808, "grad_norm": 0.48552489280700684, "learning_rate": 0.0002, "epoch": 2.4914285714285715, "step": 1090}, {"loss": 1.6659, "grad_norm": 0.4768163859844208, "learning_rate": 0.0002, "epoch": 2.5142857142857142, "step": 1100}, {"loss": 1.6251, "grad_norm": 0.5192609429359436, "learning_rate": 0.0002, "epoch": 2.5371428571428574, "step": 1110}, {"loss": 1.5581, "grad_norm": 0.49308598041534424, "learning_rate": 0.0002, "epoch": 2.56, "step": 1120}, {"loss": 1.6767, "grad_norm": 0.5068584084510803, "learning_rate": 0.0002, "epoch": 2.5828571428571427, "step": 1130}, {"loss": 1.5788, "grad_norm": 0.4822661280632019, "learning_rate": 0.0002, "epoch": 2.605714285714286, "step": 1140}, {"loss": 1.6891, "grad_norm": 0.5028144717216492, "learning_rate": 0.0002, "epoch": 2.6285714285714286, "step": 1150}, {"loss": 1.6782, "grad_norm": 0.48315200209617615, "learning_rate": 0.0002, "epoch": 2.6514285714285712, "step": 1160}, {"loss": 1.5508, "grad_norm": 0.551934540271759, "learning_rate": 0.0002, "epoch": 2.6742857142857144, "step": 1170}, {"loss": 1.5448, "grad_norm": 0.49223729968070984, "learning_rate": 0.0002, "epoch": 2.697142857142857, "step": 1180}, {"loss": 1.6382, "grad_norm": 0.514847457408905, "learning_rate": 0.0002, "epoch": 2.7199999999999998, "step": 1190}, {"loss": 1.5054, "grad_norm": 0.4830605387687683, "learning_rate": 0.0002, "epoch": 2.742857142857143, "step": 1200}, {"loss": 1.6285, "grad_norm": 0.4584822952747345, "learning_rate": 0.0002, "epoch": 2.7657142857142856, "step": 1210}, {"loss": 1.6043, "grad_norm": 0.4688762426376343, "learning_rate": 0.0002, "epoch": 2.7885714285714287, "step": 1220}, {"loss": 1.5555, "grad_norm": 0.4488156735897064, "learning_rate": 0.0002, "epoch": 2.8114285714285714, "step": 1230}, {"loss": 1.5523, "grad_norm": 0.4700278639793396, "learning_rate": 0.0002, "epoch": 2.8342857142857145, "step": 1240}, {"loss": 1.642, "grad_norm": 0.5282207131385803, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 1250}, {"loss": 1.5532, "grad_norm": 0.4874219000339508, "learning_rate": 0.0002, "epoch": 2.88, "step": 1260}, {"loss": 1.6051, "grad_norm": 0.49468332529067993, "learning_rate": 0.0002, "epoch": 2.902857142857143, "step": 1270}, {"loss": 1.6006, "grad_norm": 0.49770233035087585, "learning_rate": 0.0002, "epoch": 2.9257142857142857, "step": 1280}, {"loss": 1.606, "grad_norm": 0.4433252811431885, "learning_rate": 0.0002, "epoch": 2.9485714285714284, "step": 1290}, {"loss": 1.608, "grad_norm": 0.46836379170417786, "learning_rate": 0.0002, "epoch": 2.9714285714285715, "step": 1300}, {"loss": 1.5555, "grad_norm": 0.5001904368400574, "learning_rate": 0.0002, "epoch": 2.994285714285714, "step": 1310}, {"eval_loss": 1.8684407472610474, "eval_runtime": 111.2835, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 2.998857142857143, "step": 1312}, {"loss": 1.4486, "grad_norm": 0.5184893012046814, "learning_rate": 0.0002, "epoch": 3.0171428571428573, "step": 1320}, {"loss": 1.4082, "grad_norm": 0.5665355920791626, "learning_rate": 0.0002, "epoch": 3.04, "step": 1330}, {"loss": 1.3741, "grad_norm": 0.6601403951644897, "learning_rate": 0.0002, "epoch": 3.0628571428571427, "step": 1340}, {"loss": 1.433, "grad_norm": 0.6921621561050415, "learning_rate": 0.0002, "epoch": 3.085714285714286, "step": 1350}, {"loss": 1.4562, "grad_norm": 0.6406348943710327, "learning_rate": 0.0002, "epoch": 3.1085714285714285, "step": 1360}, {"loss": 1.3563, "grad_norm": 0.5814554691314697, "learning_rate": 0.0002, "epoch": 3.1314285714285712, "step": 1370}, {"loss": 1.4096, "grad_norm": 0.683325469493866, "learning_rate": 0.0002, "epoch": 3.1542857142857144, "step": 1380}, {"loss": 1.4106, "grad_norm": 0.6686155200004578, "learning_rate": 0.0002, "epoch": 3.177142857142857, "step": 1390}, {"loss": 1.4394, "grad_norm": 0.8159713745117188, "learning_rate": 0.0002, "epoch": 3.2, "step": 1400}, {"loss": 1.4279, "grad_norm": 0.646216094493866, "learning_rate": 0.0002, "epoch": 3.222857142857143, "step": 1410}, {"loss": 1.4232, "grad_norm": 0.7323529720306396, "learning_rate": 0.0002, "epoch": 3.2457142857142856, "step": 1420}, {"loss": 1.3891, "grad_norm": 0.689349353313446, "learning_rate": 0.0002, "epoch": 3.2685714285714287, "step": 1430}, {"loss": 1.4578, "grad_norm": 0.727894127368927, "learning_rate": 0.0002, "epoch": 3.2914285714285714, "step": 1440}, {"loss": 1.4, "grad_norm": 0.6921590566635132, "learning_rate": 0.0002, "epoch": 3.314285714285714, "step": 1450}, {"loss": 1.4272, "grad_norm": 0.6176243424415588, "learning_rate": 0.0002, "epoch": 3.337142857142857, "step": 1460}, {"loss": 1.4323, "grad_norm": 0.9006354212760925, "learning_rate": 0.0002, "epoch": 3.36, "step": 1470}, {"loss": 1.4353, "grad_norm": 0.8145929574966431, "learning_rate": 0.0002, "epoch": 3.382857142857143, "step": 1480}, {"loss": 1.3859, "grad_norm": 0.6640016436576843, "learning_rate": 0.0002, "epoch": 3.4057142857142857, "step": 1490}, {"loss": 1.387, "grad_norm": 0.7266780138015747, "learning_rate": 0.0002, "epoch": 3.4285714285714284, "step": 1500}, {"loss": 1.4108, "grad_norm": 0.9351356029510498, "learning_rate": 0.0002, "epoch": 3.4514285714285715, "step": 1510}, {"loss": 1.4656, "grad_norm": 0.675645649433136, "learning_rate": 0.0002, "epoch": 3.474285714285714, "step": 1520}, {"loss": 1.384, "grad_norm": 0.761472225189209, "learning_rate": 0.0002, "epoch": 3.4971428571428573, "step": 1530}, {"loss": 1.4968, "grad_norm": 0.6653069257736206, "learning_rate": 0.0002, "epoch": 3.52, "step": 1540}, {"loss": 1.4686, "grad_norm": 0.667412519454956, "learning_rate": 0.0002, "epoch": 3.5428571428571427, "step": 1550}, {"loss": 1.4241, "grad_norm": 0.6395593881607056, "learning_rate": 0.0002, "epoch": 3.565714285714286, "step": 1560}, {"loss": 1.4825, "grad_norm": 0.7588621377944946, "learning_rate": 0.0002, "epoch": 3.5885714285714285, "step": 1570}, {"loss": 1.4459, "grad_norm": 0.6206456422805786, "learning_rate": 0.0002, "epoch": 3.611428571428571, "step": 1580}, {"loss": 1.436, "grad_norm": 0.7591291666030884, "learning_rate": 0.0002, "epoch": 3.6342857142857143, "step": 1590}, {"loss": 1.458, "grad_norm": 0.6476313471794128, "learning_rate": 0.0002, "epoch": 3.657142857142857, "step": 1600}, {"loss": 1.4598, "grad_norm": 0.6731392741203308, "learning_rate": 0.0002, "epoch": 3.68, "step": 1610}, {"loss": 1.4225, "grad_norm": 0.725190281867981, "learning_rate": 0.0002, "epoch": 3.702857142857143, "step": 1620}, {"loss": 1.4525, "grad_norm": 0.6720049977302551, "learning_rate": 0.0002, "epoch": 3.725714285714286, "step": 1630}, {"loss": 1.429, "grad_norm": 0.6301007270812988, "learning_rate": 0.0002, "epoch": 3.7485714285714287, "step": 1640}, {"loss": 1.4166, "grad_norm": 0.715893566608429, "learning_rate": 0.0002, "epoch": 3.7714285714285714, "step": 1650}, {"loss": 1.3624, "grad_norm": 0.7539359927177429, "learning_rate": 0.0002, "epoch": 3.7942857142857145, "step": 1660}, {"loss": 1.4516, "grad_norm": 0.6658543348312378, "learning_rate": 0.0002, "epoch": 3.817142857142857, "step": 1670}, {"loss": 1.3934, "grad_norm": 0.7019526958465576, "learning_rate": 0.0002, "epoch": 3.84, "step": 1680}, {"loss": 1.4436, "grad_norm": 0.6517802476882935, "learning_rate": 0.0002, "epoch": 3.862857142857143, "step": 1690}, {"loss": 1.4968, "grad_norm": 0.7617332935333252, "learning_rate": 0.0002, "epoch": 3.8857142857142857, "step": 1700}, {"loss": 1.5145, "grad_norm": 0.6919480562210083, "learning_rate": 0.0002, "epoch": 3.9085714285714284, "step": 1710}, {"loss": 1.4317, "grad_norm": 0.6987943053245544, "learning_rate": 0.0002, "epoch": 3.9314285714285715, "step": 1720}, {"loss": 1.4704, "grad_norm": 0.7062228918075562, "learning_rate": 0.0002, "epoch": 3.954285714285714, "step": 1730}, {"loss": 1.4219, "grad_norm": 0.6769542098045349, "learning_rate": 0.0002, "epoch": 3.977142857142857, "step": 1740}, {"loss": 1.4998, "grad_norm": 0.6832144260406494, "learning_rate": 0.0002, "epoch": 4.0, "step": 1750}]} +{"epoch": 4.998857142857143, "step": 2187, "epoch_duration": 1475.801969051361, "total_accumulated_duration": 7237.824382066727, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.066, "grad_norm": 0.6273946762084961, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5855, "grad_norm": 0.5300710201263428, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1887, "grad_norm": 0.6162196397781372, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1164, "grad_norm": 0.5143047571182251, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.943, "grad_norm": 0.4000673294067383, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9531, "grad_norm": 0.444892555475235, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9435, "grad_norm": 0.4871707558631897, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9072, "grad_norm": 0.451060026884079, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9312, "grad_norm": 0.3939569592475891, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8982, "grad_norm": 0.5033721923828125, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9148, "grad_norm": 0.3636534512042999, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8462, "grad_norm": 0.4391206204891205, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7998, "grad_norm": 0.5243169665336609, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4055655598640442, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8348, "grad_norm": 0.39735132455825806, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.824, "grad_norm": 0.4696349501609802, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7566, "grad_norm": 0.3987901508808136, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.32404327392578125, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8178, "grad_norm": 0.3692261576652527, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.823, "grad_norm": 0.37267744541168213, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8564, "grad_norm": 0.3559934198856354, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8651, "grad_norm": 0.3374815285205841, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8683, "grad_norm": 0.34598177671432495, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8554, "grad_norm": 0.35629919171333313, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8751, "grad_norm": 0.3586862087249756, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7942, "grad_norm": 0.3198927342891693, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.81, "grad_norm": 0.37690025568008423, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8258, "grad_norm": 0.2855667471885681, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8288, "grad_norm": 0.3242695927619934, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7738, "grad_norm": 0.2960120141506195, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8443, "grad_norm": 0.3596384823322296, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8132, "grad_norm": 0.3001834750175476, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7751, "grad_norm": 0.31361159682273865, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8021, "grad_norm": 0.34093308448791504, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7674, "grad_norm": 0.3383876085281372, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.729, "grad_norm": 0.35100996494293213, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8517, "grad_norm": 0.344976007938385, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8779, "grad_norm": 0.3119729459285736, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8043, "grad_norm": 0.349221795797348, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7529, "grad_norm": 0.3124293386936188, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7611, "grad_norm": 0.35504350066185, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8115, "grad_norm": 0.310310959815979, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7666, "grad_norm": 0.30432847142219543, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}, {"eval_loss": 1.8310153484344482, "eval_runtime": 111.7814, "eval_samples_per_second": 4.536, "eval_steps_per_second": 0.573, "epoch": 0.9988571428571429, "step": 437}, {"loss": 1.7435, "grad_norm": 0.3121616840362549, "learning_rate": 0.0002, "epoch": 1.0057142857142858, "step": 440}, {"loss": 1.7404, "grad_norm": 0.3365118205547333, "learning_rate": 0.0002, "epoch": 1.0285714285714285, "step": 450}, {"loss": 1.7901, "grad_norm": 0.3626686930656433, "learning_rate": 0.0002, "epoch": 1.0514285714285714, "step": 460}, {"loss": 1.721, "grad_norm": 0.30539533495903015, "learning_rate": 0.0002, "epoch": 1.0742857142857143, "step": 470}, {"loss": 1.7354, "grad_norm": 0.3159816861152649, "learning_rate": 0.0002, "epoch": 1.0971428571428572, "step": 480}, {"loss": 1.7471, "grad_norm": 0.3695855736732483, "learning_rate": 0.0002, "epoch": 1.12, "step": 490}, {"loss": 1.7626, "grad_norm": 0.3609161674976349, "learning_rate": 0.0002, "epoch": 1.1428571428571428, "step": 500}, {"loss": 1.8723, "grad_norm": 0.3683869242668152, "learning_rate": 0.0002, "epoch": 1.1657142857142857, "step": 510}, {"loss": 1.7102, "grad_norm": 0.3862539529800415, "learning_rate": 0.0002, "epoch": 1.1885714285714286, "step": 520}, {"loss": 1.7589, "grad_norm": 0.4244740307331085, "learning_rate": 0.0002, "epoch": 1.2114285714285715, "step": 530}, {"loss": 1.717, "grad_norm": 0.373703271150589, "learning_rate": 0.0002, "epoch": 1.2342857142857142, "step": 540}, {"loss": 1.795, "grad_norm": 0.35715773701667786, "learning_rate": 0.0002, "epoch": 1.2571428571428571, "step": 550}, {"loss": 1.7578, "grad_norm": 0.3555964231491089, "learning_rate": 0.0002, "epoch": 1.28, "step": 560}, {"loss": 1.7228, "grad_norm": 0.35080263018608093, "learning_rate": 0.0002, "epoch": 1.302857142857143, "step": 570}, {"loss": 1.6808, "grad_norm": 0.3589482307434082, "learning_rate": 0.0002, "epoch": 1.3257142857142856, "step": 580}, {"loss": 1.7369, "grad_norm": 0.3711223900318146, "learning_rate": 0.0002, "epoch": 1.3485714285714285, "step": 590}, {"loss": 1.7417, "grad_norm": 0.313614159822464, "learning_rate": 0.0002, "epoch": 1.3714285714285714, "step": 600}, {"loss": 1.7191, "grad_norm": 0.3842357397079468, "learning_rate": 0.0002, "epoch": 1.3942857142857144, "step": 610}, {"loss": 1.6737, "grad_norm": 0.36126819252967834, "learning_rate": 0.0002, "epoch": 1.4171428571428573, "step": 620}, {"loss": 1.7073, "grad_norm": 0.35922661423683167, "learning_rate": 0.0002, "epoch": 1.44, "step": 630}, {"loss": 1.6708, "grad_norm": 0.3922875225543976, "learning_rate": 0.0002, "epoch": 1.4628571428571429, "step": 640}, {"loss": 1.7544, "grad_norm": 0.365546852350235, "learning_rate": 0.0002, "epoch": 1.4857142857142858, "step": 650}, {"loss": 1.674, "grad_norm": 0.36107590794563293, "learning_rate": 0.0002, "epoch": 1.5085714285714285, "step": 660}, {"loss": 1.6518, "grad_norm": 0.3307042121887207, "learning_rate": 0.0002, "epoch": 1.5314285714285716, "step": 670}, {"loss": 1.7306, "grad_norm": 0.3492133915424347, "learning_rate": 0.0002, "epoch": 1.5542857142857143, "step": 680}, {"loss": 1.812, "grad_norm": 0.38608574867248535, "learning_rate": 0.0002, "epoch": 1.5771428571428572, "step": 690}, {"loss": 1.7042, "grad_norm": 0.3489173650741577, "learning_rate": 0.0002, "epoch": 1.6, "step": 700}, {"loss": 1.7306, "grad_norm": 0.36614152789115906, "learning_rate": 0.0002, "epoch": 1.6228571428571428, "step": 710}, {"loss": 1.7281, "grad_norm": 0.34340205788612366, "learning_rate": 0.0002, "epoch": 1.6457142857142857, "step": 720}, {"loss": 1.7607, "grad_norm": 0.34590771794319153, "learning_rate": 0.0002, "epoch": 1.6685714285714286, "step": 730}, {"loss": 1.7108, "grad_norm": 0.3759954273700714, "learning_rate": 0.0002, "epoch": 1.6914285714285713, "step": 740}, {"loss": 1.6903, "grad_norm": 0.3753475546836853, "learning_rate": 0.0002, "epoch": 1.7142857142857144, "step": 750}, {"loss": 1.7054, "grad_norm": 0.38416001200675964, "learning_rate": 0.0002, "epoch": 1.737142857142857, "step": 760}, {"loss": 1.7125, "grad_norm": 0.36223554611206055, "learning_rate": 0.0002, "epoch": 1.76, "step": 770}, {"loss": 1.6878, "grad_norm": 0.329556941986084, "learning_rate": 0.0002, "epoch": 1.782857142857143, "step": 780}, {"loss": 1.803, "grad_norm": 0.34008052945137024, "learning_rate": 0.0002, "epoch": 1.8057142857142856, "step": 790}, {"loss": 1.6545, "grad_norm": 0.40297919511795044, "learning_rate": 0.0002, "epoch": 1.8285714285714287, "step": 800}, {"loss": 1.7518, "grad_norm": 0.35378390550613403, "learning_rate": 0.0002, "epoch": 1.8514285714285714, "step": 810}, {"loss": 1.7063, "grad_norm": 0.3625478148460388, "learning_rate": 0.0002, "epoch": 1.8742857142857143, "step": 820}, {"loss": 1.6339, "grad_norm": 0.36153221130371094, "learning_rate": 0.0002, "epoch": 1.8971428571428572, "step": 830}, {"loss": 1.7434, "grad_norm": 0.3612948954105377, "learning_rate": 0.0002, "epoch": 1.92, "step": 840}, {"loss": 1.7242, "grad_norm": 0.399213045835495, "learning_rate": 0.0002, "epoch": 1.9428571428571428, "step": 850}, {"loss": 1.6909, "grad_norm": 0.40026402473449707, "learning_rate": 0.0002, "epoch": 1.9657142857142857, "step": 860}, {"loss": 1.76, "grad_norm": 0.38114118576049805, "learning_rate": 0.0002, "epoch": 1.9885714285714284, "step": 870}, {"eval_loss": 1.8279441595077515, "eval_runtime": 111.1652, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 875}, {"loss": 1.7215, "grad_norm": 0.33838793635368347, "learning_rate": 0.0002, "epoch": 2.0114285714285716, "step": 880}, {"loss": 1.6105, "grad_norm": 0.4219334125518799, "learning_rate": 0.0002, "epoch": 2.0342857142857143, "step": 890}, {"loss": 1.6144, "grad_norm": 0.43962377309799194, "learning_rate": 0.0002, "epoch": 2.057142857142857, "step": 900}, {"loss": 1.5821, "grad_norm": 0.41956576704978943, "learning_rate": 0.0002, "epoch": 2.08, "step": 910}, {"loss": 1.5812, "grad_norm": 0.4439629912376404, "learning_rate": 0.0002, "epoch": 2.1028571428571428, "step": 920}, {"loss": 1.5923, "grad_norm": 0.43405696749687195, "learning_rate": 0.0002, "epoch": 2.125714285714286, "step": 930}, {"loss": 1.5968, "grad_norm": 0.4321737587451935, "learning_rate": 0.0002, "epoch": 2.1485714285714286, "step": 940}, {"loss": 1.5683, "grad_norm": 0.4689100682735443, "learning_rate": 0.0002, "epoch": 2.1714285714285713, "step": 950}, {"loss": 1.6442, "grad_norm": 0.47024697065353394, "learning_rate": 0.0002, "epoch": 2.1942857142857144, "step": 960}, {"loss": 1.4703, "grad_norm": 0.4535103440284729, "learning_rate": 0.0002, "epoch": 2.217142857142857, "step": 970}, {"loss": 1.6378, "grad_norm": 0.45990121364593506, "learning_rate": 0.0002, "epoch": 2.24, "step": 980}, {"loss": 1.627, "grad_norm": 0.48427215218544006, "learning_rate": 0.0002, "epoch": 2.262857142857143, "step": 990}, {"loss": 1.6316, "grad_norm": 0.43076643347740173, "learning_rate": 0.0002, "epoch": 2.2857142857142856, "step": 1000}, {"loss": 1.5938, "grad_norm": 0.4854483902454376, "learning_rate": 0.0002, "epoch": 2.3085714285714287, "step": 1010}, {"loss": 1.6223, "grad_norm": 0.46086496114730835, "learning_rate": 0.0002, "epoch": 2.3314285714285714, "step": 1020}, {"loss": 1.6392, "grad_norm": 0.4714847505092621, "learning_rate": 0.0002, "epoch": 2.354285714285714, "step": 1030}, {"loss": 1.5785, "grad_norm": 0.4423409402370453, "learning_rate": 0.0002, "epoch": 2.3771428571428572, "step": 1040}, {"loss": 1.5821, "grad_norm": 0.46261295676231384, "learning_rate": 0.0002, "epoch": 2.4, "step": 1050}, {"loss": 1.5503, "grad_norm": 0.4914337396621704, "learning_rate": 0.0002, "epoch": 2.422857142857143, "step": 1060}, {"loss": 1.6459, "grad_norm": 0.45144036412239075, "learning_rate": 0.0002, "epoch": 2.4457142857142857, "step": 1070}, {"loss": 1.6416, "grad_norm": 0.4510825276374817, "learning_rate": 0.0002, "epoch": 2.4685714285714284, "step": 1080}, {"loss": 1.5808, "grad_norm": 0.48552489280700684, "learning_rate": 0.0002, "epoch": 2.4914285714285715, "step": 1090}, {"loss": 1.6659, "grad_norm": 0.4768163859844208, "learning_rate": 0.0002, "epoch": 2.5142857142857142, "step": 1100}, {"loss": 1.6251, "grad_norm": 0.5192609429359436, "learning_rate": 0.0002, "epoch": 2.5371428571428574, "step": 1110}, {"loss": 1.5581, "grad_norm": 0.49308598041534424, "learning_rate": 0.0002, "epoch": 2.56, "step": 1120}, {"loss": 1.6767, "grad_norm": 0.5068584084510803, "learning_rate": 0.0002, "epoch": 2.5828571428571427, "step": 1130}, {"loss": 1.5788, "grad_norm": 0.4822661280632019, "learning_rate": 0.0002, "epoch": 2.605714285714286, "step": 1140}, {"loss": 1.6891, "grad_norm": 0.5028144717216492, "learning_rate": 0.0002, "epoch": 2.6285714285714286, "step": 1150}, {"loss": 1.6782, "grad_norm": 0.48315200209617615, "learning_rate": 0.0002, "epoch": 2.6514285714285712, "step": 1160}, {"loss": 1.5508, "grad_norm": 0.551934540271759, "learning_rate": 0.0002, "epoch": 2.6742857142857144, "step": 1170}, {"loss": 1.5448, "grad_norm": 0.49223729968070984, "learning_rate": 0.0002, "epoch": 2.697142857142857, "step": 1180}, {"loss": 1.6382, "grad_norm": 0.514847457408905, "learning_rate": 0.0002, "epoch": 2.7199999999999998, "step": 1190}, {"loss": 1.5054, "grad_norm": 0.4830605387687683, "learning_rate": 0.0002, "epoch": 2.742857142857143, "step": 1200}, {"loss": 1.6285, "grad_norm": 0.4584822952747345, "learning_rate": 0.0002, "epoch": 2.7657142857142856, "step": 1210}, {"loss": 1.6043, "grad_norm": 0.4688762426376343, "learning_rate": 0.0002, "epoch": 2.7885714285714287, "step": 1220}, {"loss": 1.5555, "grad_norm": 0.4488156735897064, "learning_rate": 0.0002, "epoch": 2.8114285714285714, "step": 1230}, {"loss": 1.5523, "grad_norm": 0.4700278639793396, "learning_rate": 0.0002, "epoch": 2.8342857142857145, "step": 1240}, {"loss": 1.642, "grad_norm": 0.5282207131385803, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 1250}, {"loss": 1.5532, "grad_norm": 0.4874219000339508, "learning_rate": 0.0002, "epoch": 2.88, "step": 1260}, {"loss": 1.6051, "grad_norm": 0.49468332529067993, "learning_rate": 0.0002, "epoch": 2.902857142857143, "step": 1270}, {"loss": 1.6006, "grad_norm": 0.49770233035087585, "learning_rate": 0.0002, "epoch": 2.9257142857142857, "step": 1280}, {"loss": 1.606, "grad_norm": 0.4433252811431885, "learning_rate": 0.0002, "epoch": 2.9485714285714284, "step": 1290}, {"loss": 1.608, "grad_norm": 0.46836379170417786, "learning_rate": 0.0002, "epoch": 2.9714285714285715, "step": 1300}, {"loss": 1.5555, "grad_norm": 0.5001904368400574, "learning_rate": 0.0002, "epoch": 2.994285714285714, "step": 1310}, {"eval_loss": 1.8684407472610474, "eval_runtime": 111.2835, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 2.998857142857143, "step": 1312}, {"loss": 1.4486, "grad_norm": 0.5184893012046814, "learning_rate": 0.0002, "epoch": 3.0171428571428573, "step": 1320}, {"loss": 1.4082, "grad_norm": 0.5665355920791626, "learning_rate": 0.0002, "epoch": 3.04, "step": 1330}, {"loss": 1.3741, "grad_norm": 0.6601403951644897, "learning_rate": 0.0002, "epoch": 3.0628571428571427, "step": 1340}, {"loss": 1.433, "grad_norm": 0.6921621561050415, "learning_rate": 0.0002, "epoch": 3.085714285714286, "step": 1350}, {"loss": 1.4562, "grad_norm": 0.6406348943710327, "learning_rate": 0.0002, "epoch": 3.1085714285714285, "step": 1360}, {"loss": 1.3563, "grad_norm": 0.5814554691314697, "learning_rate": 0.0002, "epoch": 3.1314285714285712, "step": 1370}, {"loss": 1.4096, "grad_norm": 0.683325469493866, "learning_rate": 0.0002, "epoch": 3.1542857142857144, "step": 1380}, {"loss": 1.4106, "grad_norm": 0.6686155200004578, "learning_rate": 0.0002, "epoch": 3.177142857142857, "step": 1390}, {"loss": 1.4394, "grad_norm": 0.8159713745117188, "learning_rate": 0.0002, "epoch": 3.2, "step": 1400}, {"loss": 1.4279, "grad_norm": 0.646216094493866, "learning_rate": 0.0002, "epoch": 3.222857142857143, "step": 1410}, {"loss": 1.4232, "grad_norm": 0.7323529720306396, "learning_rate": 0.0002, "epoch": 3.2457142857142856, "step": 1420}, {"loss": 1.3891, "grad_norm": 0.689349353313446, "learning_rate": 0.0002, "epoch": 3.2685714285714287, "step": 1430}, {"loss": 1.4578, "grad_norm": 0.727894127368927, "learning_rate": 0.0002, "epoch": 3.2914285714285714, "step": 1440}, {"loss": 1.4, "grad_norm": 0.6921590566635132, "learning_rate": 0.0002, "epoch": 3.314285714285714, "step": 1450}, {"loss": 1.4272, "grad_norm": 0.6176243424415588, "learning_rate": 0.0002, "epoch": 3.337142857142857, "step": 1460}, {"loss": 1.4323, "grad_norm": 0.9006354212760925, "learning_rate": 0.0002, "epoch": 3.36, "step": 1470}, {"loss": 1.4353, "grad_norm": 0.8145929574966431, "learning_rate": 0.0002, "epoch": 3.382857142857143, "step": 1480}, {"loss": 1.3859, "grad_norm": 0.6640016436576843, "learning_rate": 0.0002, "epoch": 3.4057142857142857, "step": 1490}, {"loss": 1.387, "grad_norm": 0.7266780138015747, "learning_rate": 0.0002, "epoch": 3.4285714285714284, "step": 1500}, {"loss": 1.4108, "grad_norm": 0.9351356029510498, "learning_rate": 0.0002, "epoch": 3.4514285714285715, "step": 1510}, {"loss": 1.4656, "grad_norm": 0.675645649433136, "learning_rate": 0.0002, "epoch": 3.474285714285714, "step": 1520}, {"loss": 1.384, "grad_norm": 0.761472225189209, "learning_rate": 0.0002, "epoch": 3.4971428571428573, "step": 1530}, {"loss": 1.4968, "grad_norm": 0.6653069257736206, "learning_rate": 0.0002, "epoch": 3.52, "step": 1540}, {"loss": 1.4686, "grad_norm": 0.667412519454956, "learning_rate": 0.0002, "epoch": 3.5428571428571427, "step": 1550}, {"loss": 1.4241, "grad_norm": 0.6395593881607056, "learning_rate": 0.0002, "epoch": 3.565714285714286, "step": 1560}, {"loss": 1.4825, "grad_norm": 0.7588621377944946, "learning_rate": 0.0002, "epoch": 3.5885714285714285, "step": 1570}, {"loss": 1.4459, "grad_norm": 0.6206456422805786, "learning_rate": 0.0002, "epoch": 3.611428571428571, "step": 1580}, {"loss": 1.436, "grad_norm": 0.7591291666030884, "learning_rate": 0.0002, "epoch": 3.6342857142857143, "step": 1590}, {"loss": 1.458, "grad_norm": 0.6476313471794128, "learning_rate": 0.0002, "epoch": 3.657142857142857, "step": 1600}, {"loss": 1.4598, "grad_norm": 0.6731392741203308, "learning_rate": 0.0002, "epoch": 3.68, "step": 1610}, {"loss": 1.4225, "grad_norm": 0.725190281867981, "learning_rate": 0.0002, "epoch": 3.702857142857143, "step": 1620}, {"loss": 1.4525, "grad_norm": 0.6720049977302551, "learning_rate": 0.0002, "epoch": 3.725714285714286, "step": 1630}, {"loss": 1.429, "grad_norm": 0.6301007270812988, "learning_rate": 0.0002, "epoch": 3.7485714285714287, "step": 1640}, {"loss": 1.4166, "grad_norm": 0.715893566608429, "learning_rate": 0.0002, "epoch": 3.7714285714285714, "step": 1650}, {"loss": 1.3624, "grad_norm": 0.7539359927177429, "learning_rate": 0.0002, "epoch": 3.7942857142857145, "step": 1660}, {"loss": 1.4516, "grad_norm": 0.6658543348312378, "learning_rate": 0.0002, "epoch": 3.817142857142857, "step": 1670}, {"loss": 1.3934, "grad_norm": 0.7019526958465576, "learning_rate": 0.0002, "epoch": 3.84, "step": 1680}, {"loss": 1.4436, "grad_norm": 0.6517802476882935, "learning_rate": 0.0002, "epoch": 3.862857142857143, "step": 1690}, {"loss": 1.4968, "grad_norm": 0.7617332935333252, "learning_rate": 0.0002, "epoch": 3.8857142857142857, "step": 1700}, {"loss": 1.5145, "grad_norm": 0.6919480562210083, "learning_rate": 0.0002, "epoch": 3.9085714285714284, "step": 1710}, {"loss": 1.4317, "grad_norm": 0.6987943053245544, "learning_rate": 0.0002, "epoch": 3.9314285714285715, "step": 1720}, {"loss": 1.4704, "grad_norm": 0.7062228918075562, "learning_rate": 0.0002, "epoch": 3.954285714285714, "step": 1730}, {"loss": 1.4219, "grad_norm": 0.6769542098045349, "learning_rate": 0.0002, "epoch": 3.977142857142857, "step": 1740}, {"loss": 1.4998, "grad_norm": 0.6832144260406494, "learning_rate": 0.0002, "epoch": 4.0, "step": 1750}, {"eval_loss": 1.9474865198135376, "eval_runtime": 111.288, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 4.0, "step": 1750}, {"loss": 1.2251, "grad_norm": 1.064110279083252, "learning_rate": 0.0002, "epoch": 4.022857142857143, "step": 1760}, {"loss": 1.2013, "grad_norm": 0.8380683660507202, "learning_rate": 0.0002, "epoch": 4.045714285714285, "step": 1770}, {"loss": 1.2416, "grad_norm": 1.1863020658493042, "learning_rate": 0.0002, "epoch": 4.0685714285714285, "step": 1780}, {"loss": 1.2499, "grad_norm": 1.0128898620605469, "learning_rate": 0.0002, "epoch": 4.091428571428572, "step": 1790}, {"loss": 1.2043, "grad_norm": 0.9221312403678894, "learning_rate": 0.0002, "epoch": 4.114285714285714, "step": 1800}, {"loss": 1.181, "grad_norm": 1.1298727989196777, "learning_rate": 0.0002, "epoch": 4.137142857142857, "step": 1810}, {"loss": 1.1491, "grad_norm": 0.8854547739028931, "learning_rate": 0.0002, "epoch": 4.16, "step": 1820}, {"loss": 1.2156, "grad_norm": 0.8920808434486389, "learning_rate": 0.0002, "epoch": 4.182857142857143, "step": 1830}, {"loss": 1.1969, "grad_norm": 0.913244366645813, "learning_rate": 0.0002, "epoch": 4.2057142857142855, "step": 1840}, {"loss": 1.2156, "grad_norm": 0.908831000328064, "learning_rate": 0.0002, "epoch": 4.228571428571429, "step": 1850}, {"loss": 1.1653, "grad_norm": 1.0223685503005981, "learning_rate": 0.0002, "epoch": 4.251428571428572, "step": 1860}, {"loss": 1.2497, "grad_norm": 0.9771921634674072, "learning_rate": 0.0002, "epoch": 4.274285714285714, "step": 1870}, {"loss": 1.213, "grad_norm": 0.9313384890556335, "learning_rate": 0.0002, "epoch": 4.297142857142857, "step": 1880}, {"loss": 1.1723, "grad_norm": 1.0754257440567017, "learning_rate": 0.0002, "epoch": 4.32, "step": 1890}, {"loss": 1.2286, "grad_norm": 0.8904672265052795, "learning_rate": 0.0002, "epoch": 4.3428571428571425, "step": 1900}, {"loss": 1.2618, "grad_norm": 1.046527624130249, "learning_rate": 0.0002, "epoch": 4.365714285714286, "step": 1910}, {"loss": 1.2368, "grad_norm": 0.9576982855796814, "learning_rate": 0.0002, "epoch": 4.388571428571429, "step": 1920}, {"loss": 1.211, "grad_norm": 0.9278356432914734, "learning_rate": 0.0002, "epoch": 4.411428571428571, "step": 1930}, {"loss": 1.2005, "grad_norm": 1.1763030290603638, "learning_rate": 0.0002, "epoch": 4.434285714285714, "step": 1940}, {"loss": 1.1541, "grad_norm": 0.9183000326156616, "learning_rate": 0.0002, "epoch": 4.457142857142857, "step": 1950}, {"loss": 1.2257, "grad_norm": 1.050980806350708, "learning_rate": 0.0002, "epoch": 4.48, "step": 1960}, {"loss": 1.2133, "grad_norm": 0.9975392818450928, "learning_rate": 0.0002, "epoch": 4.502857142857143, "step": 1970}, {"loss": 1.2312, "grad_norm": 0.990544319152832, "learning_rate": 0.0002, "epoch": 4.525714285714286, "step": 1980}, {"loss": 1.2465, "grad_norm": 1.004794955253601, "learning_rate": 0.0002, "epoch": 4.548571428571429, "step": 1990}, {"loss": 1.2085, "grad_norm": 0.9294857978820801, "learning_rate": 0.0002, "epoch": 4.571428571428571, "step": 2000}, {"loss": 1.2874, "grad_norm": 0.93436598777771, "learning_rate": 0.0002, "epoch": 4.594285714285714, "step": 2010}, {"loss": 1.1965, "grad_norm": 0.8704655766487122, "learning_rate": 0.0002, "epoch": 4.617142857142857, "step": 2020}, {"loss": 1.204, "grad_norm": 0.9077927470207214, "learning_rate": 0.0002, "epoch": 4.64, "step": 2030}, {"loss": 1.2198, "grad_norm": 0.912987470626831, "learning_rate": 0.0002, "epoch": 4.662857142857143, "step": 2040}, {"loss": 1.2868, "grad_norm": 0.9740643501281738, "learning_rate": 0.0002, "epoch": 4.685714285714286, "step": 2050}, {"loss": 1.249, "grad_norm": 1.133357048034668, "learning_rate": 0.0002, "epoch": 4.708571428571428, "step": 2060}, {"loss": 1.1974, "grad_norm": 0.8844527006149292, "learning_rate": 0.0002, "epoch": 4.731428571428571, "step": 2070}, {"loss": 1.2481, "grad_norm": 1.0083311796188354, "learning_rate": 0.0002, "epoch": 4.7542857142857144, "step": 2080}, {"loss": 1.263, "grad_norm": 1.000447154045105, "learning_rate": 0.0002, "epoch": 4.777142857142858, "step": 2090}, {"loss": 1.2313, "grad_norm": 0.9620300531387329, "learning_rate": 0.0002, "epoch": 4.8, "step": 2100}, {"loss": 1.2659, "grad_norm": 0.9843335151672363, "learning_rate": 0.0002, "epoch": 4.822857142857143, "step": 2110}, {"loss": 1.2535, "grad_norm": 0.9906681180000305, "learning_rate": 0.0002, "epoch": 4.845714285714286, "step": 2120}, {"loss": 1.2325, "grad_norm": 0.9544073939323425, "learning_rate": 0.0002, "epoch": 4.868571428571428, "step": 2130}, {"loss": 1.284, "grad_norm": 0.9392994046211243, "learning_rate": 0.0002, "epoch": 4.8914285714285715, "step": 2140}, {"loss": 1.3075, "grad_norm": 1.104519248008728, "learning_rate": 0.0002, "epoch": 4.914285714285715, "step": 2150}, {"loss": 1.2753, "grad_norm": 0.9495956897735596, "learning_rate": 0.0002, "epoch": 4.937142857142857, "step": 2160}, {"loss": 1.2412, "grad_norm": 0.9696287512779236, "learning_rate": 0.0002, "epoch": 4.96, "step": 2170}, {"loss": 1.2354, "grad_norm": 0.9933681488037109, "learning_rate": 0.0002, "epoch": 4.982857142857143, "step": 2180}]} +{"epoch": 6.0, "step": 2625, "epoch_duration": 1401.2988345623016, "total_accumulated_duration": 8639.123216629028, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.066, "grad_norm": 0.6273946762084961, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5855, "grad_norm": 0.5300710201263428, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1887, "grad_norm": 0.6162196397781372, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1164, "grad_norm": 0.5143047571182251, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.943, "grad_norm": 0.4000673294067383, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9531, "grad_norm": 0.444892555475235, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9435, "grad_norm": 0.4871707558631897, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9072, "grad_norm": 0.451060026884079, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9312, "grad_norm": 0.3939569592475891, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8982, "grad_norm": 0.5033721923828125, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9148, "grad_norm": 0.3636534512042999, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8462, "grad_norm": 0.4391206204891205, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7998, "grad_norm": 0.5243169665336609, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4055655598640442, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8348, "grad_norm": 0.39735132455825806, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.824, "grad_norm": 0.4696349501609802, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7566, "grad_norm": 0.3987901508808136, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.32404327392578125, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8178, "grad_norm": 0.3692261576652527, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.823, "grad_norm": 0.37267744541168213, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8564, "grad_norm": 0.3559934198856354, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8651, "grad_norm": 0.3374815285205841, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8683, "grad_norm": 0.34598177671432495, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8554, "grad_norm": 0.35629919171333313, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8751, "grad_norm": 0.3586862087249756, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7942, "grad_norm": 0.3198927342891693, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.81, "grad_norm": 0.37690025568008423, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8258, "grad_norm": 0.2855667471885681, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8288, "grad_norm": 0.3242695927619934, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7738, "grad_norm": 0.2960120141506195, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8443, "grad_norm": 0.3596384823322296, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8132, "grad_norm": 0.3001834750175476, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7751, "grad_norm": 0.31361159682273865, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8021, "grad_norm": 0.34093308448791504, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7674, "grad_norm": 0.3383876085281372, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.729, "grad_norm": 0.35100996494293213, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8517, "grad_norm": 0.344976007938385, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8779, "grad_norm": 0.3119729459285736, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8043, "grad_norm": 0.349221795797348, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7529, "grad_norm": 0.3124293386936188, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7611, "grad_norm": 0.35504350066185, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8115, "grad_norm": 0.310310959815979, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7666, "grad_norm": 0.30432847142219543, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}, {"eval_loss": 1.8310153484344482, "eval_runtime": 111.7814, "eval_samples_per_second": 4.536, "eval_steps_per_second": 0.573, "epoch": 0.9988571428571429, "step": 437}, {"loss": 1.7435, "grad_norm": 0.3121616840362549, "learning_rate": 0.0002, "epoch": 1.0057142857142858, "step": 440}, {"loss": 1.7404, "grad_norm": 0.3365118205547333, "learning_rate": 0.0002, "epoch": 1.0285714285714285, "step": 450}, {"loss": 1.7901, "grad_norm": 0.3626686930656433, "learning_rate": 0.0002, "epoch": 1.0514285714285714, "step": 460}, {"loss": 1.721, "grad_norm": 0.30539533495903015, "learning_rate": 0.0002, "epoch": 1.0742857142857143, "step": 470}, {"loss": 1.7354, "grad_norm": 0.3159816861152649, "learning_rate": 0.0002, "epoch": 1.0971428571428572, "step": 480}, {"loss": 1.7471, "grad_norm": 0.3695855736732483, "learning_rate": 0.0002, "epoch": 1.12, "step": 490}, {"loss": 1.7626, "grad_norm": 0.3609161674976349, "learning_rate": 0.0002, "epoch": 1.1428571428571428, "step": 500}, {"loss": 1.8723, "grad_norm": 0.3683869242668152, "learning_rate": 0.0002, "epoch": 1.1657142857142857, "step": 510}, {"loss": 1.7102, "grad_norm": 0.3862539529800415, "learning_rate": 0.0002, "epoch": 1.1885714285714286, "step": 520}, {"loss": 1.7589, "grad_norm": 0.4244740307331085, "learning_rate": 0.0002, "epoch": 1.2114285714285715, "step": 530}, {"loss": 1.717, "grad_norm": 0.373703271150589, "learning_rate": 0.0002, "epoch": 1.2342857142857142, "step": 540}, {"loss": 1.795, "grad_norm": 0.35715773701667786, "learning_rate": 0.0002, "epoch": 1.2571428571428571, "step": 550}, {"loss": 1.7578, "grad_norm": 0.3555964231491089, "learning_rate": 0.0002, "epoch": 1.28, "step": 560}, {"loss": 1.7228, "grad_norm": 0.35080263018608093, "learning_rate": 0.0002, "epoch": 1.302857142857143, "step": 570}, {"loss": 1.6808, "grad_norm": 0.3589482307434082, "learning_rate": 0.0002, "epoch": 1.3257142857142856, "step": 580}, {"loss": 1.7369, "grad_norm": 0.3711223900318146, "learning_rate": 0.0002, "epoch": 1.3485714285714285, "step": 590}, {"loss": 1.7417, "grad_norm": 0.313614159822464, "learning_rate": 0.0002, "epoch": 1.3714285714285714, "step": 600}, {"loss": 1.7191, "grad_norm": 0.3842357397079468, "learning_rate": 0.0002, "epoch": 1.3942857142857144, "step": 610}, {"loss": 1.6737, "grad_norm": 0.36126819252967834, "learning_rate": 0.0002, "epoch": 1.4171428571428573, "step": 620}, {"loss": 1.7073, "grad_norm": 0.35922661423683167, "learning_rate": 0.0002, "epoch": 1.44, "step": 630}, {"loss": 1.6708, "grad_norm": 0.3922875225543976, "learning_rate": 0.0002, "epoch": 1.4628571428571429, "step": 640}, {"loss": 1.7544, "grad_norm": 0.365546852350235, "learning_rate": 0.0002, "epoch": 1.4857142857142858, "step": 650}, {"loss": 1.674, "grad_norm": 0.36107590794563293, "learning_rate": 0.0002, "epoch": 1.5085714285714285, "step": 660}, {"loss": 1.6518, "grad_norm": 0.3307042121887207, "learning_rate": 0.0002, "epoch": 1.5314285714285716, "step": 670}, {"loss": 1.7306, "grad_norm": 0.3492133915424347, "learning_rate": 0.0002, "epoch": 1.5542857142857143, "step": 680}, {"loss": 1.812, "grad_norm": 0.38608574867248535, "learning_rate": 0.0002, "epoch": 1.5771428571428572, "step": 690}, {"loss": 1.7042, "grad_norm": 0.3489173650741577, "learning_rate": 0.0002, "epoch": 1.6, "step": 700}, {"loss": 1.7306, "grad_norm": 0.36614152789115906, "learning_rate": 0.0002, "epoch": 1.6228571428571428, "step": 710}, {"loss": 1.7281, "grad_norm": 0.34340205788612366, "learning_rate": 0.0002, "epoch": 1.6457142857142857, "step": 720}, {"loss": 1.7607, "grad_norm": 0.34590771794319153, "learning_rate": 0.0002, "epoch": 1.6685714285714286, "step": 730}, {"loss": 1.7108, "grad_norm": 0.3759954273700714, "learning_rate": 0.0002, "epoch": 1.6914285714285713, "step": 740}, {"loss": 1.6903, "grad_norm": 0.3753475546836853, "learning_rate": 0.0002, "epoch": 1.7142857142857144, "step": 750}, {"loss": 1.7054, "grad_norm": 0.38416001200675964, "learning_rate": 0.0002, "epoch": 1.737142857142857, "step": 760}, {"loss": 1.7125, "grad_norm": 0.36223554611206055, "learning_rate": 0.0002, "epoch": 1.76, "step": 770}, {"loss": 1.6878, "grad_norm": 0.329556941986084, "learning_rate": 0.0002, "epoch": 1.782857142857143, "step": 780}, {"loss": 1.803, "grad_norm": 0.34008052945137024, "learning_rate": 0.0002, "epoch": 1.8057142857142856, "step": 790}, {"loss": 1.6545, "grad_norm": 0.40297919511795044, "learning_rate": 0.0002, "epoch": 1.8285714285714287, "step": 800}, {"loss": 1.7518, "grad_norm": 0.35378390550613403, "learning_rate": 0.0002, "epoch": 1.8514285714285714, "step": 810}, {"loss": 1.7063, "grad_norm": 0.3625478148460388, "learning_rate": 0.0002, "epoch": 1.8742857142857143, "step": 820}, {"loss": 1.6339, "grad_norm": 0.36153221130371094, "learning_rate": 0.0002, "epoch": 1.8971428571428572, "step": 830}, {"loss": 1.7434, "grad_norm": 0.3612948954105377, "learning_rate": 0.0002, "epoch": 1.92, "step": 840}, {"loss": 1.7242, "grad_norm": 0.399213045835495, "learning_rate": 0.0002, "epoch": 1.9428571428571428, "step": 850}, {"loss": 1.6909, "grad_norm": 0.40026402473449707, "learning_rate": 0.0002, "epoch": 1.9657142857142857, "step": 860}, {"loss": 1.76, "grad_norm": 0.38114118576049805, "learning_rate": 0.0002, "epoch": 1.9885714285714284, "step": 870}, {"eval_loss": 1.8279441595077515, "eval_runtime": 111.1652, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 875}, {"loss": 1.7215, "grad_norm": 0.33838793635368347, "learning_rate": 0.0002, "epoch": 2.0114285714285716, "step": 880}, {"loss": 1.6105, "grad_norm": 0.4219334125518799, "learning_rate": 0.0002, "epoch": 2.0342857142857143, "step": 890}, {"loss": 1.6144, "grad_norm": 0.43962377309799194, "learning_rate": 0.0002, "epoch": 2.057142857142857, "step": 900}, {"loss": 1.5821, "grad_norm": 0.41956576704978943, "learning_rate": 0.0002, "epoch": 2.08, "step": 910}, {"loss": 1.5812, "grad_norm": 0.4439629912376404, "learning_rate": 0.0002, "epoch": 2.1028571428571428, "step": 920}, {"loss": 1.5923, "grad_norm": 0.43405696749687195, "learning_rate": 0.0002, "epoch": 2.125714285714286, "step": 930}, {"loss": 1.5968, "grad_norm": 0.4321737587451935, "learning_rate": 0.0002, "epoch": 2.1485714285714286, "step": 940}, {"loss": 1.5683, "grad_norm": 0.4689100682735443, "learning_rate": 0.0002, "epoch": 2.1714285714285713, "step": 950}, {"loss": 1.6442, "grad_norm": 0.47024697065353394, "learning_rate": 0.0002, "epoch": 2.1942857142857144, "step": 960}, {"loss": 1.4703, "grad_norm": 0.4535103440284729, "learning_rate": 0.0002, "epoch": 2.217142857142857, "step": 970}, {"loss": 1.6378, "grad_norm": 0.45990121364593506, "learning_rate": 0.0002, "epoch": 2.24, "step": 980}, {"loss": 1.627, "grad_norm": 0.48427215218544006, "learning_rate": 0.0002, "epoch": 2.262857142857143, "step": 990}, {"loss": 1.6316, "grad_norm": 0.43076643347740173, "learning_rate": 0.0002, "epoch": 2.2857142857142856, "step": 1000}, {"loss": 1.5938, "grad_norm": 0.4854483902454376, "learning_rate": 0.0002, "epoch": 2.3085714285714287, "step": 1010}, {"loss": 1.6223, "grad_norm": 0.46086496114730835, "learning_rate": 0.0002, "epoch": 2.3314285714285714, "step": 1020}, {"loss": 1.6392, "grad_norm": 0.4714847505092621, "learning_rate": 0.0002, "epoch": 2.354285714285714, "step": 1030}, {"loss": 1.5785, "grad_norm": 0.4423409402370453, "learning_rate": 0.0002, "epoch": 2.3771428571428572, "step": 1040}, {"loss": 1.5821, "grad_norm": 0.46261295676231384, "learning_rate": 0.0002, "epoch": 2.4, "step": 1050}, {"loss": 1.5503, "grad_norm": 0.4914337396621704, "learning_rate": 0.0002, "epoch": 2.422857142857143, "step": 1060}, {"loss": 1.6459, "grad_norm": 0.45144036412239075, "learning_rate": 0.0002, "epoch": 2.4457142857142857, "step": 1070}, {"loss": 1.6416, "grad_norm": 0.4510825276374817, "learning_rate": 0.0002, "epoch": 2.4685714285714284, "step": 1080}, {"loss": 1.5808, "grad_norm": 0.48552489280700684, "learning_rate": 0.0002, "epoch": 2.4914285714285715, "step": 1090}, {"loss": 1.6659, "grad_norm": 0.4768163859844208, "learning_rate": 0.0002, "epoch": 2.5142857142857142, "step": 1100}, {"loss": 1.6251, "grad_norm": 0.5192609429359436, "learning_rate": 0.0002, "epoch": 2.5371428571428574, "step": 1110}, {"loss": 1.5581, "grad_norm": 0.49308598041534424, "learning_rate": 0.0002, "epoch": 2.56, "step": 1120}, {"loss": 1.6767, "grad_norm": 0.5068584084510803, "learning_rate": 0.0002, "epoch": 2.5828571428571427, "step": 1130}, {"loss": 1.5788, "grad_norm": 0.4822661280632019, "learning_rate": 0.0002, "epoch": 2.605714285714286, "step": 1140}, {"loss": 1.6891, "grad_norm": 0.5028144717216492, "learning_rate": 0.0002, "epoch": 2.6285714285714286, "step": 1150}, {"loss": 1.6782, "grad_norm": 0.48315200209617615, "learning_rate": 0.0002, "epoch": 2.6514285714285712, "step": 1160}, {"loss": 1.5508, "grad_norm": 0.551934540271759, "learning_rate": 0.0002, "epoch": 2.6742857142857144, "step": 1170}, {"loss": 1.5448, "grad_norm": 0.49223729968070984, "learning_rate": 0.0002, "epoch": 2.697142857142857, "step": 1180}, {"loss": 1.6382, "grad_norm": 0.514847457408905, "learning_rate": 0.0002, "epoch": 2.7199999999999998, "step": 1190}, {"loss": 1.5054, "grad_norm": 0.4830605387687683, "learning_rate": 0.0002, "epoch": 2.742857142857143, "step": 1200}, {"loss": 1.6285, "grad_norm": 0.4584822952747345, "learning_rate": 0.0002, "epoch": 2.7657142857142856, "step": 1210}, {"loss": 1.6043, "grad_norm": 0.4688762426376343, "learning_rate": 0.0002, "epoch": 2.7885714285714287, "step": 1220}, {"loss": 1.5555, "grad_norm": 0.4488156735897064, "learning_rate": 0.0002, "epoch": 2.8114285714285714, "step": 1230}, {"loss": 1.5523, "grad_norm": 0.4700278639793396, "learning_rate": 0.0002, "epoch": 2.8342857142857145, "step": 1240}, {"loss": 1.642, "grad_norm": 0.5282207131385803, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 1250}, {"loss": 1.5532, "grad_norm": 0.4874219000339508, "learning_rate": 0.0002, "epoch": 2.88, "step": 1260}, {"loss": 1.6051, "grad_norm": 0.49468332529067993, "learning_rate": 0.0002, "epoch": 2.902857142857143, "step": 1270}, {"loss": 1.6006, "grad_norm": 0.49770233035087585, "learning_rate": 0.0002, "epoch": 2.9257142857142857, "step": 1280}, {"loss": 1.606, "grad_norm": 0.4433252811431885, "learning_rate": 0.0002, "epoch": 2.9485714285714284, "step": 1290}, {"loss": 1.608, "grad_norm": 0.46836379170417786, "learning_rate": 0.0002, "epoch": 2.9714285714285715, "step": 1300}, {"loss": 1.5555, "grad_norm": 0.5001904368400574, "learning_rate": 0.0002, "epoch": 2.994285714285714, "step": 1310}, {"eval_loss": 1.8684407472610474, "eval_runtime": 111.2835, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 2.998857142857143, "step": 1312}, {"loss": 1.4486, "grad_norm": 0.5184893012046814, "learning_rate": 0.0002, "epoch": 3.0171428571428573, "step": 1320}, {"loss": 1.4082, "grad_norm": 0.5665355920791626, "learning_rate": 0.0002, "epoch": 3.04, "step": 1330}, {"loss": 1.3741, "grad_norm": 0.6601403951644897, "learning_rate": 0.0002, "epoch": 3.0628571428571427, "step": 1340}, {"loss": 1.433, "grad_norm": 0.6921621561050415, "learning_rate": 0.0002, "epoch": 3.085714285714286, "step": 1350}, {"loss": 1.4562, "grad_norm": 0.6406348943710327, "learning_rate": 0.0002, "epoch": 3.1085714285714285, "step": 1360}, {"loss": 1.3563, "grad_norm": 0.5814554691314697, "learning_rate": 0.0002, "epoch": 3.1314285714285712, "step": 1370}, {"loss": 1.4096, "grad_norm": 0.683325469493866, "learning_rate": 0.0002, "epoch": 3.1542857142857144, "step": 1380}, {"loss": 1.4106, "grad_norm": 0.6686155200004578, "learning_rate": 0.0002, "epoch": 3.177142857142857, "step": 1390}, {"loss": 1.4394, "grad_norm": 0.8159713745117188, "learning_rate": 0.0002, "epoch": 3.2, "step": 1400}, {"loss": 1.4279, "grad_norm": 0.646216094493866, "learning_rate": 0.0002, "epoch": 3.222857142857143, "step": 1410}, {"loss": 1.4232, "grad_norm": 0.7323529720306396, "learning_rate": 0.0002, "epoch": 3.2457142857142856, "step": 1420}, {"loss": 1.3891, "grad_norm": 0.689349353313446, "learning_rate": 0.0002, "epoch": 3.2685714285714287, "step": 1430}, {"loss": 1.4578, "grad_norm": 0.727894127368927, "learning_rate": 0.0002, "epoch": 3.2914285714285714, "step": 1440}, {"loss": 1.4, "grad_norm": 0.6921590566635132, "learning_rate": 0.0002, "epoch": 3.314285714285714, "step": 1450}, {"loss": 1.4272, "grad_norm": 0.6176243424415588, "learning_rate": 0.0002, "epoch": 3.337142857142857, "step": 1460}, {"loss": 1.4323, "grad_norm": 0.9006354212760925, "learning_rate": 0.0002, "epoch": 3.36, "step": 1470}, {"loss": 1.4353, "grad_norm": 0.8145929574966431, "learning_rate": 0.0002, "epoch": 3.382857142857143, "step": 1480}, {"loss": 1.3859, "grad_norm": 0.6640016436576843, "learning_rate": 0.0002, "epoch": 3.4057142857142857, "step": 1490}, {"loss": 1.387, "grad_norm": 0.7266780138015747, "learning_rate": 0.0002, "epoch": 3.4285714285714284, "step": 1500}, {"loss": 1.4108, "grad_norm": 0.9351356029510498, "learning_rate": 0.0002, "epoch": 3.4514285714285715, "step": 1510}, {"loss": 1.4656, "grad_norm": 0.675645649433136, "learning_rate": 0.0002, "epoch": 3.474285714285714, "step": 1520}, {"loss": 1.384, "grad_norm": 0.761472225189209, "learning_rate": 0.0002, "epoch": 3.4971428571428573, "step": 1530}, {"loss": 1.4968, "grad_norm": 0.6653069257736206, "learning_rate": 0.0002, "epoch": 3.52, "step": 1540}, {"loss": 1.4686, "grad_norm": 0.667412519454956, "learning_rate": 0.0002, "epoch": 3.5428571428571427, "step": 1550}, {"loss": 1.4241, "grad_norm": 0.6395593881607056, "learning_rate": 0.0002, "epoch": 3.565714285714286, "step": 1560}, {"loss": 1.4825, "grad_norm": 0.7588621377944946, "learning_rate": 0.0002, "epoch": 3.5885714285714285, "step": 1570}, {"loss": 1.4459, "grad_norm": 0.6206456422805786, "learning_rate": 0.0002, "epoch": 3.611428571428571, "step": 1580}, {"loss": 1.436, "grad_norm": 0.7591291666030884, "learning_rate": 0.0002, "epoch": 3.6342857142857143, "step": 1590}, {"loss": 1.458, "grad_norm": 0.6476313471794128, "learning_rate": 0.0002, "epoch": 3.657142857142857, "step": 1600}, {"loss": 1.4598, "grad_norm": 0.6731392741203308, "learning_rate": 0.0002, "epoch": 3.68, "step": 1610}, {"loss": 1.4225, "grad_norm": 0.725190281867981, "learning_rate": 0.0002, "epoch": 3.702857142857143, "step": 1620}, {"loss": 1.4525, "grad_norm": 0.6720049977302551, "learning_rate": 0.0002, "epoch": 3.725714285714286, "step": 1630}, {"loss": 1.429, "grad_norm": 0.6301007270812988, "learning_rate": 0.0002, "epoch": 3.7485714285714287, "step": 1640}, {"loss": 1.4166, "grad_norm": 0.715893566608429, "learning_rate": 0.0002, "epoch": 3.7714285714285714, "step": 1650}, {"loss": 1.3624, "grad_norm": 0.7539359927177429, "learning_rate": 0.0002, "epoch": 3.7942857142857145, "step": 1660}, {"loss": 1.4516, "grad_norm": 0.6658543348312378, "learning_rate": 0.0002, "epoch": 3.817142857142857, "step": 1670}, {"loss": 1.3934, "grad_norm": 0.7019526958465576, "learning_rate": 0.0002, "epoch": 3.84, "step": 1680}, {"loss": 1.4436, "grad_norm": 0.6517802476882935, "learning_rate": 0.0002, "epoch": 3.862857142857143, "step": 1690}, {"loss": 1.4968, "grad_norm": 0.7617332935333252, "learning_rate": 0.0002, "epoch": 3.8857142857142857, "step": 1700}, {"loss": 1.5145, "grad_norm": 0.6919480562210083, "learning_rate": 0.0002, "epoch": 3.9085714285714284, "step": 1710}, {"loss": 1.4317, "grad_norm": 0.6987943053245544, "learning_rate": 0.0002, "epoch": 3.9314285714285715, "step": 1720}, {"loss": 1.4704, "grad_norm": 0.7062228918075562, "learning_rate": 0.0002, "epoch": 3.954285714285714, "step": 1730}, {"loss": 1.4219, "grad_norm": 0.6769542098045349, "learning_rate": 0.0002, "epoch": 3.977142857142857, "step": 1740}, {"loss": 1.4998, "grad_norm": 0.6832144260406494, "learning_rate": 0.0002, "epoch": 4.0, "step": 1750}, {"eval_loss": 1.9474865198135376, "eval_runtime": 111.288, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 4.0, "step": 1750}, {"loss": 1.2251, "grad_norm": 1.064110279083252, "learning_rate": 0.0002, "epoch": 4.022857142857143, "step": 1760}, {"loss": 1.2013, "grad_norm": 0.8380683660507202, "learning_rate": 0.0002, "epoch": 4.045714285714285, "step": 1770}, {"loss": 1.2416, "grad_norm": 1.1863020658493042, "learning_rate": 0.0002, "epoch": 4.0685714285714285, "step": 1780}, {"loss": 1.2499, "grad_norm": 1.0128898620605469, "learning_rate": 0.0002, "epoch": 4.091428571428572, "step": 1790}, {"loss": 1.2043, "grad_norm": 0.9221312403678894, "learning_rate": 0.0002, "epoch": 4.114285714285714, "step": 1800}, {"loss": 1.181, "grad_norm": 1.1298727989196777, "learning_rate": 0.0002, "epoch": 4.137142857142857, "step": 1810}, {"loss": 1.1491, "grad_norm": 0.8854547739028931, "learning_rate": 0.0002, "epoch": 4.16, "step": 1820}, {"loss": 1.2156, "grad_norm": 0.8920808434486389, "learning_rate": 0.0002, "epoch": 4.182857142857143, "step": 1830}, {"loss": 1.1969, "grad_norm": 0.913244366645813, "learning_rate": 0.0002, "epoch": 4.2057142857142855, "step": 1840}, {"loss": 1.2156, "grad_norm": 0.908831000328064, "learning_rate": 0.0002, "epoch": 4.228571428571429, "step": 1850}, {"loss": 1.1653, "grad_norm": 1.0223685503005981, "learning_rate": 0.0002, "epoch": 4.251428571428572, "step": 1860}, {"loss": 1.2497, "grad_norm": 0.9771921634674072, "learning_rate": 0.0002, "epoch": 4.274285714285714, "step": 1870}, {"loss": 1.213, "grad_norm": 0.9313384890556335, "learning_rate": 0.0002, "epoch": 4.297142857142857, "step": 1880}, {"loss": 1.1723, "grad_norm": 1.0754257440567017, "learning_rate": 0.0002, "epoch": 4.32, "step": 1890}, {"loss": 1.2286, "grad_norm": 0.8904672265052795, "learning_rate": 0.0002, "epoch": 4.3428571428571425, "step": 1900}, {"loss": 1.2618, "grad_norm": 1.046527624130249, "learning_rate": 0.0002, "epoch": 4.365714285714286, "step": 1910}, {"loss": 1.2368, "grad_norm": 0.9576982855796814, "learning_rate": 0.0002, "epoch": 4.388571428571429, "step": 1920}, {"loss": 1.211, "grad_norm": 0.9278356432914734, "learning_rate": 0.0002, "epoch": 4.411428571428571, "step": 1930}, {"loss": 1.2005, "grad_norm": 1.1763030290603638, "learning_rate": 0.0002, "epoch": 4.434285714285714, "step": 1940}, {"loss": 1.1541, "grad_norm": 0.9183000326156616, "learning_rate": 0.0002, "epoch": 4.457142857142857, "step": 1950}, {"loss": 1.2257, "grad_norm": 1.050980806350708, "learning_rate": 0.0002, "epoch": 4.48, "step": 1960}, {"loss": 1.2133, "grad_norm": 0.9975392818450928, "learning_rate": 0.0002, "epoch": 4.502857142857143, "step": 1970}, {"loss": 1.2312, "grad_norm": 0.990544319152832, "learning_rate": 0.0002, "epoch": 4.525714285714286, "step": 1980}, {"loss": 1.2465, "grad_norm": 1.004794955253601, "learning_rate": 0.0002, "epoch": 4.548571428571429, "step": 1990}, {"loss": 1.2085, "grad_norm": 0.9294857978820801, "learning_rate": 0.0002, "epoch": 4.571428571428571, "step": 2000}, {"loss": 1.2874, "grad_norm": 0.93436598777771, "learning_rate": 0.0002, "epoch": 4.594285714285714, "step": 2010}, {"loss": 1.1965, "grad_norm": 0.8704655766487122, "learning_rate": 0.0002, "epoch": 4.617142857142857, "step": 2020}, {"loss": 1.204, "grad_norm": 0.9077927470207214, "learning_rate": 0.0002, "epoch": 4.64, "step": 2030}, {"loss": 1.2198, "grad_norm": 0.912987470626831, "learning_rate": 0.0002, "epoch": 4.662857142857143, "step": 2040}, {"loss": 1.2868, "grad_norm": 0.9740643501281738, "learning_rate": 0.0002, "epoch": 4.685714285714286, "step": 2050}, {"loss": 1.249, "grad_norm": 1.133357048034668, "learning_rate": 0.0002, "epoch": 4.708571428571428, "step": 2060}, {"loss": 1.1974, "grad_norm": 0.8844527006149292, "learning_rate": 0.0002, "epoch": 4.731428571428571, "step": 2070}, {"loss": 1.2481, "grad_norm": 1.0083311796188354, "learning_rate": 0.0002, "epoch": 4.7542857142857144, "step": 2080}, {"loss": 1.263, "grad_norm": 1.000447154045105, "learning_rate": 0.0002, "epoch": 4.777142857142858, "step": 2090}, {"loss": 1.2313, "grad_norm": 0.9620300531387329, "learning_rate": 0.0002, "epoch": 4.8, "step": 2100}, {"loss": 1.2659, "grad_norm": 0.9843335151672363, "learning_rate": 0.0002, "epoch": 4.822857142857143, "step": 2110}, {"loss": 1.2535, "grad_norm": 0.9906681180000305, "learning_rate": 0.0002, "epoch": 4.845714285714286, "step": 2120}, {"loss": 1.2325, "grad_norm": 0.9544073939323425, "learning_rate": 0.0002, "epoch": 4.868571428571428, "step": 2130}, {"loss": 1.284, "grad_norm": 0.9392994046211243, "learning_rate": 0.0002, "epoch": 4.8914285714285715, "step": 2140}, {"loss": 1.3075, "grad_norm": 1.104519248008728, "learning_rate": 0.0002, "epoch": 4.914285714285715, "step": 2150}, {"loss": 1.2753, "grad_norm": 0.9495956897735596, "learning_rate": 0.0002, "epoch": 4.937142857142857, "step": 2160}, {"loss": 1.2412, "grad_norm": 0.9696287512779236, "learning_rate": 0.0002, "epoch": 4.96, "step": 2170}, {"loss": 1.2354, "grad_norm": 0.9933681488037109, "learning_rate": 0.0002, "epoch": 4.982857142857143, "step": 2180}, {"eval_loss": 2.099808692932129, "eval_runtime": 111.2808, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 4.998857142857143, "step": 2187}, {"loss": 1.2183, "grad_norm": 0.9482853412628174, "learning_rate": 0.0002, "epoch": 5.005714285714285, "step": 2190}, {"loss": 0.9898, "grad_norm": 1.6689555644989014, "learning_rate": 0.0002, "epoch": 5.0285714285714285, "step": 2200}, {"loss": 0.9741, "grad_norm": 1.2019699811935425, "learning_rate": 0.0002, "epoch": 5.051428571428572, "step": 2210}, {"loss": 0.9737, "grad_norm": 1.535780429840088, "learning_rate": 0.0002, "epoch": 5.074285714285715, "step": 2220}, {"loss": 0.9494, "grad_norm": 1.2061309814453125, "learning_rate": 0.0002, "epoch": 5.097142857142857, "step": 2230}, {"loss": 0.9316, "grad_norm": 1.1898778676986694, "learning_rate": 0.0002, "epoch": 5.12, "step": 2240}, {"loss": 1.002, "grad_norm": 1.158898949623108, "learning_rate": 0.0002, "epoch": 5.142857142857143, "step": 2250}, {"loss": 0.9715, "grad_norm": 1.370749592781067, "learning_rate": 0.0002, "epoch": 5.1657142857142855, "step": 2260}, {"loss": 0.9365, "grad_norm": 1.314120888710022, "learning_rate": 0.0002, "epoch": 5.188571428571429, "step": 2270}, {"loss": 1.0316, "grad_norm": 1.2184966802597046, "learning_rate": 0.0002, "epoch": 5.211428571428572, "step": 2280}, {"loss": 0.9407, "grad_norm": 1.4833279848098755, "learning_rate": 0.0002, "epoch": 5.234285714285714, "step": 2290}, {"loss": 0.9635, "grad_norm": 1.3348219394683838, "learning_rate": 0.0002, "epoch": 5.257142857142857, "step": 2300}, {"loss": 1.0294, "grad_norm": 1.4166619777679443, "learning_rate": 0.0002, "epoch": 5.28, "step": 2310}, {"loss": 0.9818, "grad_norm": 1.4539530277252197, "learning_rate": 0.0002, "epoch": 5.3028571428571425, "step": 2320}, {"loss": 1.0165, "grad_norm": 1.4642518758773804, "learning_rate": 0.0002, "epoch": 5.325714285714286, "step": 2330}, {"loss": 1.0081, "grad_norm": 1.3938848972320557, "learning_rate": 0.0002, "epoch": 5.348571428571429, "step": 2340}, {"loss": 1.03, "grad_norm": 1.1147894859313965, "learning_rate": 0.0002, "epoch": 5.371428571428572, "step": 2350}, {"loss": 0.9975, "grad_norm": 1.3465309143066406, "learning_rate": 0.0002, "epoch": 5.394285714285714, "step": 2360}, {"loss": 1.0138, "grad_norm": 1.4788566827774048, "learning_rate": 0.0002, "epoch": 5.417142857142857, "step": 2370}, {"loss": 0.9896, "grad_norm": 1.3808705806732178, "learning_rate": 0.0002, "epoch": 5.44, "step": 2380}, {"loss": 1.0279, "grad_norm": 1.2336329221725464, "learning_rate": 0.0002, "epoch": 5.462857142857143, "step": 2390}, {"loss": 0.9763, "grad_norm": 1.5445678234100342, "learning_rate": 0.0002, "epoch": 5.485714285714286, "step": 2400}, {"loss": 0.9534, "grad_norm": 1.107488989830017, "learning_rate": 0.0002, "epoch": 5.508571428571429, "step": 2410}, {"loss": 1.0036, "grad_norm": 1.39687979221344, "learning_rate": 0.0002, "epoch": 5.531428571428571, "step": 2420}, {"loss": 0.9959, "grad_norm": 1.3905695676803589, "learning_rate": 0.0002, "epoch": 5.554285714285714, "step": 2430}, {"loss": 0.9912, "grad_norm": 1.3772821426391602, "learning_rate": 0.0002, "epoch": 5.577142857142857, "step": 2440}, {"loss": 0.9825, "grad_norm": 1.1661899089813232, "learning_rate": 0.0002, "epoch": 5.6, "step": 2450}, {"loss": 1.0003, "grad_norm": 1.2730463743209839, "learning_rate": 0.0002, "epoch": 5.622857142857143, "step": 2460}, {"loss": 1.0433, "grad_norm": 1.2251193523406982, "learning_rate": 0.0002, "epoch": 5.645714285714286, "step": 2470}, {"loss": 1.079, "grad_norm": 1.5454859733581543, "learning_rate": 0.0002, "epoch": 5.668571428571429, "step": 2480}, {"loss": 1.0414, "grad_norm": 1.5405735969543457, "learning_rate": 0.0002, "epoch": 5.691428571428571, "step": 2490}, {"loss": 1.0353, "grad_norm": 1.2555434703826904, "learning_rate": 0.0002, "epoch": 5.714285714285714, "step": 2500}, {"loss": 1.0019, "grad_norm": 1.3323487043380737, "learning_rate": 0.0002, "epoch": 5.737142857142857, "step": 2510}, {"loss": 1.051, "grad_norm": 1.3106356859207153, "learning_rate": 0.0002, "epoch": 5.76, "step": 2520}, {"loss": 1.0248, "grad_norm": 1.4832439422607422, "learning_rate": 0.0002, "epoch": 5.782857142857143, "step": 2530}, {"loss": 1.0643, "grad_norm": 1.1336562633514404, "learning_rate": 0.0002, "epoch": 5.805714285714286, "step": 2540}, {"loss": 1.0446, "grad_norm": 1.2434223890304565, "learning_rate": 0.0002, "epoch": 5.828571428571428, "step": 2550}, {"loss": 1.0467, "grad_norm": 1.2825450897216797, "learning_rate": 0.0002, "epoch": 5.851428571428571, "step": 2560}, {"loss": 1.0642, "grad_norm": 1.4373180866241455, "learning_rate": 0.0002, "epoch": 5.8742857142857146, "step": 2570}, {"loss": 1.0814, "grad_norm": 1.435015320777893, "learning_rate": 0.0002, "epoch": 5.897142857142857, "step": 2580}, {"loss": 1.0272, "grad_norm": 1.4075653553009033, "learning_rate": 0.0002, "epoch": 5.92, "step": 2590}, {"loss": 1.0703, "grad_norm": 1.319630742073059, "learning_rate": 0.0002, "epoch": 5.942857142857143, "step": 2600}, {"loss": 1.0375, "grad_norm": 1.278330683708191, "learning_rate": 0.0002, "epoch": 5.965714285714286, "step": 2610}, {"loss": 1.0766, "grad_norm": 1.258158564567566, "learning_rate": 0.0002, "epoch": 5.988571428571428, "step": 2620}]} +{"epoch": 6.998857142857143, "step": 3062, "epoch_duration": 774.590334892273, "total_accumulated_duration": 9413.713551521301, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.066, "grad_norm": 0.6273946762084961, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5855, "grad_norm": 0.5300710201263428, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1887, "grad_norm": 0.6162196397781372, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1164, "grad_norm": 0.5143047571182251, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.943, "grad_norm": 0.4000673294067383, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9531, "grad_norm": 0.444892555475235, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9435, "grad_norm": 0.4871707558631897, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9072, "grad_norm": 0.451060026884079, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9312, "grad_norm": 0.3939569592475891, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8982, "grad_norm": 0.5033721923828125, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9148, "grad_norm": 0.3636534512042999, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8462, "grad_norm": 0.4391206204891205, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7998, "grad_norm": 0.5243169665336609, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4055655598640442, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8348, "grad_norm": 0.39735132455825806, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.824, "grad_norm": 0.4696349501609802, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7566, "grad_norm": 0.3987901508808136, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.32404327392578125, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8178, "grad_norm": 0.3692261576652527, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.823, "grad_norm": 0.37267744541168213, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8564, "grad_norm": 0.3559934198856354, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8651, "grad_norm": 0.3374815285205841, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8683, "grad_norm": 0.34598177671432495, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8554, "grad_norm": 0.35629919171333313, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8751, "grad_norm": 0.3586862087249756, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7942, "grad_norm": 0.3198927342891693, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.81, "grad_norm": 0.37690025568008423, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8258, "grad_norm": 0.2855667471885681, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8288, "grad_norm": 0.3242695927619934, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7738, "grad_norm": 0.2960120141506195, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8443, "grad_norm": 0.3596384823322296, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8132, "grad_norm": 0.3001834750175476, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7751, "grad_norm": 0.31361159682273865, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8021, "grad_norm": 0.34093308448791504, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7674, "grad_norm": 0.3383876085281372, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.729, "grad_norm": 0.35100996494293213, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8517, "grad_norm": 0.344976007938385, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8779, "grad_norm": 0.3119729459285736, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8043, "grad_norm": 0.349221795797348, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7529, "grad_norm": 0.3124293386936188, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7611, "grad_norm": 0.35504350066185, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8115, "grad_norm": 0.310310959815979, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7666, "grad_norm": 0.30432847142219543, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}, {"eval_loss": 1.8310153484344482, "eval_runtime": 111.7814, "eval_samples_per_second": 4.536, "eval_steps_per_second": 0.573, "epoch": 0.9988571428571429, "step": 437}, {"loss": 1.7435, "grad_norm": 0.3121616840362549, "learning_rate": 0.0002, "epoch": 1.0057142857142858, "step": 440}, {"loss": 1.7404, "grad_norm": 0.3365118205547333, "learning_rate": 0.0002, "epoch": 1.0285714285714285, "step": 450}, {"loss": 1.7901, "grad_norm": 0.3626686930656433, "learning_rate": 0.0002, "epoch": 1.0514285714285714, "step": 460}, {"loss": 1.721, "grad_norm": 0.30539533495903015, "learning_rate": 0.0002, "epoch": 1.0742857142857143, "step": 470}, {"loss": 1.7354, "grad_norm": 0.3159816861152649, "learning_rate": 0.0002, "epoch": 1.0971428571428572, "step": 480}, {"loss": 1.7471, "grad_norm": 0.3695855736732483, "learning_rate": 0.0002, "epoch": 1.12, "step": 490}, {"loss": 1.7626, "grad_norm": 0.3609161674976349, "learning_rate": 0.0002, "epoch": 1.1428571428571428, "step": 500}, {"loss": 1.8723, "grad_norm": 0.3683869242668152, "learning_rate": 0.0002, "epoch": 1.1657142857142857, "step": 510}, {"loss": 1.7102, "grad_norm": 0.3862539529800415, "learning_rate": 0.0002, "epoch": 1.1885714285714286, "step": 520}, {"loss": 1.7589, "grad_norm": 0.4244740307331085, "learning_rate": 0.0002, "epoch": 1.2114285714285715, "step": 530}, {"loss": 1.717, "grad_norm": 0.373703271150589, "learning_rate": 0.0002, "epoch": 1.2342857142857142, "step": 540}, {"loss": 1.795, "grad_norm": 0.35715773701667786, "learning_rate": 0.0002, "epoch": 1.2571428571428571, "step": 550}, {"loss": 1.7578, "grad_norm": 0.3555964231491089, "learning_rate": 0.0002, "epoch": 1.28, "step": 560}, {"loss": 1.7228, "grad_norm": 0.35080263018608093, "learning_rate": 0.0002, "epoch": 1.302857142857143, "step": 570}, {"loss": 1.6808, "grad_norm": 0.3589482307434082, "learning_rate": 0.0002, "epoch": 1.3257142857142856, "step": 580}, {"loss": 1.7369, "grad_norm": 0.3711223900318146, "learning_rate": 0.0002, "epoch": 1.3485714285714285, "step": 590}, {"loss": 1.7417, "grad_norm": 0.313614159822464, "learning_rate": 0.0002, "epoch": 1.3714285714285714, "step": 600}, {"loss": 1.7191, "grad_norm": 0.3842357397079468, "learning_rate": 0.0002, "epoch": 1.3942857142857144, "step": 610}, {"loss": 1.6737, "grad_norm": 0.36126819252967834, "learning_rate": 0.0002, "epoch": 1.4171428571428573, "step": 620}, {"loss": 1.7073, "grad_norm": 0.35922661423683167, "learning_rate": 0.0002, "epoch": 1.44, "step": 630}, {"loss": 1.6708, "grad_norm": 0.3922875225543976, "learning_rate": 0.0002, "epoch": 1.4628571428571429, "step": 640}, {"loss": 1.7544, "grad_norm": 0.365546852350235, "learning_rate": 0.0002, "epoch": 1.4857142857142858, "step": 650}, {"loss": 1.674, "grad_norm": 0.36107590794563293, "learning_rate": 0.0002, "epoch": 1.5085714285714285, "step": 660}, {"loss": 1.6518, "grad_norm": 0.3307042121887207, "learning_rate": 0.0002, "epoch": 1.5314285714285716, "step": 670}, {"loss": 1.7306, "grad_norm": 0.3492133915424347, "learning_rate": 0.0002, "epoch": 1.5542857142857143, "step": 680}, {"loss": 1.812, "grad_norm": 0.38608574867248535, "learning_rate": 0.0002, "epoch": 1.5771428571428572, "step": 690}, {"loss": 1.7042, "grad_norm": 0.3489173650741577, "learning_rate": 0.0002, "epoch": 1.6, "step": 700}, {"loss": 1.7306, "grad_norm": 0.36614152789115906, "learning_rate": 0.0002, "epoch": 1.6228571428571428, "step": 710}, {"loss": 1.7281, "grad_norm": 0.34340205788612366, "learning_rate": 0.0002, "epoch": 1.6457142857142857, "step": 720}, {"loss": 1.7607, "grad_norm": 0.34590771794319153, "learning_rate": 0.0002, "epoch": 1.6685714285714286, "step": 730}, {"loss": 1.7108, "grad_norm": 0.3759954273700714, "learning_rate": 0.0002, "epoch": 1.6914285714285713, "step": 740}, {"loss": 1.6903, "grad_norm": 0.3753475546836853, "learning_rate": 0.0002, "epoch": 1.7142857142857144, "step": 750}, {"loss": 1.7054, "grad_norm": 0.38416001200675964, "learning_rate": 0.0002, "epoch": 1.737142857142857, "step": 760}, {"loss": 1.7125, "grad_norm": 0.36223554611206055, "learning_rate": 0.0002, "epoch": 1.76, "step": 770}, {"loss": 1.6878, "grad_norm": 0.329556941986084, "learning_rate": 0.0002, "epoch": 1.782857142857143, "step": 780}, {"loss": 1.803, "grad_norm": 0.34008052945137024, "learning_rate": 0.0002, "epoch": 1.8057142857142856, "step": 790}, {"loss": 1.6545, "grad_norm": 0.40297919511795044, "learning_rate": 0.0002, "epoch": 1.8285714285714287, "step": 800}, {"loss": 1.7518, "grad_norm": 0.35378390550613403, "learning_rate": 0.0002, "epoch": 1.8514285714285714, "step": 810}, {"loss": 1.7063, "grad_norm": 0.3625478148460388, "learning_rate": 0.0002, "epoch": 1.8742857142857143, "step": 820}, {"loss": 1.6339, "grad_norm": 0.36153221130371094, "learning_rate": 0.0002, "epoch": 1.8971428571428572, "step": 830}, {"loss": 1.7434, "grad_norm": 0.3612948954105377, "learning_rate": 0.0002, "epoch": 1.92, "step": 840}, {"loss": 1.7242, "grad_norm": 0.399213045835495, "learning_rate": 0.0002, "epoch": 1.9428571428571428, "step": 850}, {"loss": 1.6909, "grad_norm": 0.40026402473449707, "learning_rate": 0.0002, "epoch": 1.9657142857142857, "step": 860}, {"loss": 1.76, "grad_norm": 0.38114118576049805, "learning_rate": 0.0002, "epoch": 1.9885714285714284, "step": 870}, {"eval_loss": 1.8279441595077515, "eval_runtime": 111.1652, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 875}, {"loss": 1.7215, "grad_norm": 0.33838793635368347, "learning_rate": 0.0002, "epoch": 2.0114285714285716, "step": 880}, {"loss": 1.6105, "grad_norm": 0.4219334125518799, "learning_rate": 0.0002, "epoch": 2.0342857142857143, "step": 890}, {"loss": 1.6144, "grad_norm": 0.43962377309799194, "learning_rate": 0.0002, "epoch": 2.057142857142857, "step": 900}, {"loss": 1.5821, "grad_norm": 0.41956576704978943, "learning_rate": 0.0002, "epoch": 2.08, "step": 910}, {"loss": 1.5812, "grad_norm": 0.4439629912376404, "learning_rate": 0.0002, "epoch": 2.1028571428571428, "step": 920}, {"loss": 1.5923, "grad_norm": 0.43405696749687195, "learning_rate": 0.0002, "epoch": 2.125714285714286, "step": 930}, {"loss": 1.5968, "grad_norm": 0.4321737587451935, "learning_rate": 0.0002, "epoch": 2.1485714285714286, "step": 940}, {"loss": 1.5683, "grad_norm": 0.4689100682735443, "learning_rate": 0.0002, "epoch": 2.1714285714285713, "step": 950}, {"loss": 1.6442, "grad_norm": 0.47024697065353394, "learning_rate": 0.0002, "epoch": 2.1942857142857144, "step": 960}, {"loss": 1.4703, "grad_norm": 0.4535103440284729, "learning_rate": 0.0002, "epoch": 2.217142857142857, "step": 970}, {"loss": 1.6378, "grad_norm": 0.45990121364593506, "learning_rate": 0.0002, "epoch": 2.24, "step": 980}, {"loss": 1.627, "grad_norm": 0.48427215218544006, "learning_rate": 0.0002, "epoch": 2.262857142857143, "step": 990}, {"loss": 1.6316, "grad_norm": 0.43076643347740173, "learning_rate": 0.0002, "epoch": 2.2857142857142856, "step": 1000}, {"loss": 1.5938, "grad_norm": 0.4854483902454376, "learning_rate": 0.0002, "epoch": 2.3085714285714287, "step": 1010}, {"loss": 1.6223, "grad_norm": 0.46086496114730835, "learning_rate": 0.0002, "epoch": 2.3314285714285714, "step": 1020}, {"loss": 1.6392, "grad_norm": 0.4714847505092621, "learning_rate": 0.0002, "epoch": 2.354285714285714, "step": 1030}, {"loss": 1.5785, "grad_norm": 0.4423409402370453, "learning_rate": 0.0002, "epoch": 2.3771428571428572, "step": 1040}, {"loss": 1.5821, "grad_norm": 0.46261295676231384, "learning_rate": 0.0002, "epoch": 2.4, "step": 1050}, {"loss": 1.5503, "grad_norm": 0.4914337396621704, "learning_rate": 0.0002, "epoch": 2.422857142857143, "step": 1060}, {"loss": 1.6459, "grad_norm": 0.45144036412239075, "learning_rate": 0.0002, "epoch": 2.4457142857142857, "step": 1070}, {"loss": 1.6416, "grad_norm": 0.4510825276374817, "learning_rate": 0.0002, "epoch": 2.4685714285714284, "step": 1080}, {"loss": 1.5808, "grad_norm": 0.48552489280700684, "learning_rate": 0.0002, "epoch": 2.4914285714285715, "step": 1090}, {"loss": 1.6659, "grad_norm": 0.4768163859844208, "learning_rate": 0.0002, "epoch": 2.5142857142857142, "step": 1100}, {"loss": 1.6251, "grad_norm": 0.5192609429359436, "learning_rate": 0.0002, "epoch": 2.5371428571428574, "step": 1110}, {"loss": 1.5581, "grad_norm": 0.49308598041534424, "learning_rate": 0.0002, "epoch": 2.56, "step": 1120}, {"loss": 1.6767, "grad_norm": 0.5068584084510803, "learning_rate": 0.0002, "epoch": 2.5828571428571427, "step": 1130}, {"loss": 1.5788, "grad_norm": 0.4822661280632019, "learning_rate": 0.0002, "epoch": 2.605714285714286, "step": 1140}, {"loss": 1.6891, "grad_norm": 0.5028144717216492, "learning_rate": 0.0002, "epoch": 2.6285714285714286, "step": 1150}, {"loss": 1.6782, "grad_norm": 0.48315200209617615, "learning_rate": 0.0002, "epoch": 2.6514285714285712, "step": 1160}, {"loss": 1.5508, "grad_norm": 0.551934540271759, "learning_rate": 0.0002, "epoch": 2.6742857142857144, "step": 1170}, {"loss": 1.5448, "grad_norm": 0.49223729968070984, "learning_rate": 0.0002, "epoch": 2.697142857142857, "step": 1180}, {"loss": 1.6382, "grad_norm": 0.514847457408905, "learning_rate": 0.0002, "epoch": 2.7199999999999998, "step": 1190}, {"loss": 1.5054, "grad_norm": 0.4830605387687683, "learning_rate": 0.0002, "epoch": 2.742857142857143, "step": 1200}, {"loss": 1.6285, "grad_norm": 0.4584822952747345, "learning_rate": 0.0002, "epoch": 2.7657142857142856, "step": 1210}, {"loss": 1.6043, "grad_norm": 0.4688762426376343, "learning_rate": 0.0002, "epoch": 2.7885714285714287, "step": 1220}, {"loss": 1.5555, "grad_norm": 0.4488156735897064, "learning_rate": 0.0002, "epoch": 2.8114285714285714, "step": 1230}, {"loss": 1.5523, "grad_norm": 0.4700278639793396, "learning_rate": 0.0002, "epoch": 2.8342857142857145, "step": 1240}, {"loss": 1.642, "grad_norm": 0.5282207131385803, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 1250}, {"loss": 1.5532, "grad_norm": 0.4874219000339508, "learning_rate": 0.0002, "epoch": 2.88, "step": 1260}, {"loss": 1.6051, "grad_norm": 0.49468332529067993, "learning_rate": 0.0002, "epoch": 2.902857142857143, "step": 1270}, {"loss": 1.6006, "grad_norm": 0.49770233035087585, "learning_rate": 0.0002, "epoch": 2.9257142857142857, "step": 1280}, {"loss": 1.606, "grad_norm": 0.4433252811431885, "learning_rate": 0.0002, "epoch": 2.9485714285714284, "step": 1290}, {"loss": 1.608, "grad_norm": 0.46836379170417786, "learning_rate": 0.0002, "epoch": 2.9714285714285715, "step": 1300}, {"loss": 1.5555, "grad_norm": 0.5001904368400574, "learning_rate": 0.0002, "epoch": 2.994285714285714, "step": 1310}, {"eval_loss": 1.8684407472610474, "eval_runtime": 111.2835, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 2.998857142857143, "step": 1312}, {"loss": 1.4486, "grad_norm": 0.5184893012046814, "learning_rate": 0.0002, "epoch": 3.0171428571428573, "step": 1320}, {"loss": 1.4082, "grad_norm": 0.5665355920791626, "learning_rate": 0.0002, "epoch": 3.04, "step": 1330}, {"loss": 1.3741, "grad_norm": 0.6601403951644897, "learning_rate": 0.0002, "epoch": 3.0628571428571427, "step": 1340}, {"loss": 1.433, "grad_norm": 0.6921621561050415, "learning_rate": 0.0002, "epoch": 3.085714285714286, "step": 1350}, {"loss": 1.4562, "grad_norm": 0.6406348943710327, "learning_rate": 0.0002, "epoch": 3.1085714285714285, "step": 1360}, {"loss": 1.3563, "grad_norm": 0.5814554691314697, "learning_rate": 0.0002, "epoch": 3.1314285714285712, "step": 1370}, {"loss": 1.4096, "grad_norm": 0.683325469493866, "learning_rate": 0.0002, "epoch": 3.1542857142857144, "step": 1380}, {"loss": 1.4106, "grad_norm": 0.6686155200004578, "learning_rate": 0.0002, "epoch": 3.177142857142857, "step": 1390}, {"loss": 1.4394, "grad_norm": 0.8159713745117188, "learning_rate": 0.0002, "epoch": 3.2, "step": 1400}, {"loss": 1.4279, "grad_norm": 0.646216094493866, "learning_rate": 0.0002, "epoch": 3.222857142857143, "step": 1410}, {"loss": 1.4232, "grad_norm": 0.7323529720306396, "learning_rate": 0.0002, "epoch": 3.2457142857142856, "step": 1420}, {"loss": 1.3891, "grad_norm": 0.689349353313446, "learning_rate": 0.0002, "epoch": 3.2685714285714287, "step": 1430}, {"loss": 1.4578, "grad_norm": 0.727894127368927, "learning_rate": 0.0002, "epoch": 3.2914285714285714, "step": 1440}, {"loss": 1.4, "grad_norm": 0.6921590566635132, "learning_rate": 0.0002, "epoch": 3.314285714285714, "step": 1450}, {"loss": 1.4272, "grad_norm": 0.6176243424415588, "learning_rate": 0.0002, "epoch": 3.337142857142857, "step": 1460}, {"loss": 1.4323, "grad_norm": 0.9006354212760925, "learning_rate": 0.0002, "epoch": 3.36, "step": 1470}, {"loss": 1.4353, "grad_norm": 0.8145929574966431, "learning_rate": 0.0002, "epoch": 3.382857142857143, "step": 1480}, {"loss": 1.3859, "grad_norm": 0.6640016436576843, "learning_rate": 0.0002, "epoch": 3.4057142857142857, "step": 1490}, {"loss": 1.387, "grad_norm": 0.7266780138015747, "learning_rate": 0.0002, "epoch": 3.4285714285714284, "step": 1500}, {"loss": 1.4108, "grad_norm": 0.9351356029510498, "learning_rate": 0.0002, "epoch": 3.4514285714285715, "step": 1510}, {"loss": 1.4656, "grad_norm": 0.675645649433136, "learning_rate": 0.0002, "epoch": 3.474285714285714, "step": 1520}, {"loss": 1.384, "grad_norm": 0.761472225189209, "learning_rate": 0.0002, "epoch": 3.4971428571428573, "step": 1530}, {"loss": 1.4968, "grad_norm": 0.6653069257736206, "learning_rate": 0.0002, "epoch": 3.52, "step": 1540}, {"loss": 1.4686, "grad_norm": 0.667412519454956, "learning_rate": 0.0002, "epoch": 3.5428571428571427, "step": 1550}, {"loss": 1.4241, "grad_norm": 0.6395593881607056, "learning_rate": 0.0002, "epoch": 3.565714285714286, "step": 1560}, {"loss": 1.4825, "grad_norm": 0.7588621377944946, "learning_rate": 0.0002, "epoch": 3.5885714285714285, "step": 1570}, {"loss": 1.4459, "grad_norm": 0.6206456422805786, "learning_rate": 0.0002, "epoch": 3.611428571428571, "step": 1580}, {"loss": 1.436, "grad_norm": 0.7591291666030884, "learning_rate": 0.0002, "epoch": 3.6342857142857143, "step": 1590}, {"loss": 1.458, "grad_norm": 0.6476313471794128, "learning_rate": 0.0002, "epoch": 3.657142857142857, "step": 1600}, {"loss": 1.4598, "grad_norm": 0.6731392741203308, "learning_rate": 0.0002, "epoch": 3.68, "step": 1610}, {"loss": 1.4225, "grad_norm": 0.725190281867981, "learning_rate": 0.0002, "epoch": 3.702857142857143, "step": 1620}, {"loss": 1.4525, "grad_norm": 0.6720049977302551, "learning_rate": 0.0002, "epoch": 3.725714285714286, "step": 1630}, {"loss": 1.429, "grad_norm": 0.6301007270812988, "learning_rate": 0.0002, "epoch": 3.7485714285714287, "step": 1640}, {"loss": 1.4166, "grad_norm": 0.715893566608429, "learning_rate": 0.0002, "epoch": 3.7714285714285714, "step": 1650}, {"loss": 1.3624, "grad_norm": 0.7539359927177429, "learning_rate": 0.0002, "epoch": 3.7942857142857145, "step": 1660}, {"loss": 1.4516, "grad_norm": 0.6658543348312378, "learning_rate": 0.0002, "epoch": 3.817142857142857, "step": 1670}, {"loss": 1.3934, "grad_norm": 0.7019526958465576, "learning_rate": 0.0002, "epoch": 3.84, "step": 1680}, {"loss": 1.4436, "grad_norm": 0.6517802476882935, "learning_rate": 0.0002, "epoch": 3.862857142857143, "step": 1690}, {"loss": 1.4968, "grad_norm": 0.7617332935333252, "learning_rate": 0.0002, "epoch": 3.8857142857142857, "step": 1700}, {"loss": 1.5145, "grad_norm": 0.6919480562210083, "learning_rate": 0.0002, "epoch": 3.9085714285714284, "step": 1710}, {"loss": 1.4317, "grad_norm": 0.6987943053245544, "learning_rate": 0.0002, "epoch": 3.9314285714285715, "step": 1720}, {"loss": 1.4704, "grad_norm": 0.7062228918075562, "learning_rate": 0.0002, "epoch": 3.954285714285714, "step": 1730}, {"loss": 1.4219, "grad_norm": 0.6769542098045349, "learning_rate": 0.0002, "epoch": 3.977142857142857, "step": 1740}, {"loss": 1.4998, "grad_norm": 0.6832144260406494, "learning_rate": 0.0002, "epoch": 4.0, "step": 1750}, {"eval_loss": 1.9474865198135376, "eval_runtime": 111.288, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 4.0, "step": 1750}, {"loss": 1.2251, "grad_norm": 1.064110279083252, "learning_rate": 0.0002, "epoch": 4.022857142857143, "step": 1760}, {"loss": 1.2013, "grad_norm": 0.8380683660507202, "learning_rate": 0.0002, "epoch": 4.045714285714285, "step": 1770}, {"loss": 1.2416, "grad_norm": 1.1863020658493042, "learning_rate": 0.0002, "epoch": 4.0685714285714285, "step": 1780}, {"loss": 1.2499, "grad_norm": 1.0128898620605469, "learning_rate": 0.0002, "epoch": 4.091428571428572, "step": 1790}, {"loss": 1.2043, "grad_norm": 0.9221312403678894, "learning_rate": 0.0002, "epoch": 4.114285714285714, "step": 1800}, {"loss": 1.181, "grad_norm": 1.1298727989196777, "learning_rate": 0.0002, "epoch": 4.137142857142857, "step": 1810}, {"loss": 1.1491, "grad_norm": 0.8854547739028931, "learning_rate": 0.0002, "epoch": 4.16, "step": 1820}, {"loss": 1.2156, "grad_norm": 0.8920808434486389, "learning_rate": 0.0002, "epoch": 4.182857142857143, "step": 1830}, {"loss": 1.1969, "grad_norm": 0.913244366645813, "learning_rate": 0.0002, "epoch": 4.2057142857142855, "step": 1840}, {"loss": 1.2156, "grad_norm": 0.908831000328064, "learning_rate": 0.0002, "epoch": 4.228571428571429, "step": 1850}, {"loss": 1.1653, "grad_norm": 1.0223685503005981, "learning_rate": 0.0002, "epoch": 4.251428571428572, "step": 1860}, {"loss": 1.2497, "grad_norm": 0.9771921634674072, "learning_rate": 0.0002, "epoch": 4.274285714285714, "step": 1870}, {"loss": 1.213, "grad_norm": 0.9313384890556335, "learning_rate": 0.0002, "epoch": 4.297142857142857, "step": 1880}, {"loss": 1.1723, "grad_norm": 1.0754257440567017, "learning_rate": 0.0002, "epoch": 4.32, "step": 1890}, {"loss": 1.2286, "grad_norm": 0.8904672265052795, "learning_rate": 0.0002, "epoch": 4.3428571428571425, "step": 1900}, {"loss": 1.2618, "grad_norm": 1.046527624130249, "learning_rate": 0.0002, "epoch": 4.365714285714286, "step": 1910}, {"loss": 1.2368, "grad_norm": 0.9576982855796814, "learning_rate": 0.0002, "epoch": 4.388571428571429, "step": 1920}, {"loss": 1.211, "grad_norm": 0.9278356432914734, "learning_rate": 0.0002, "epoch": 4.411428571428571, "step": 1930}, {"loss": 1.2005, "grad_norm": 1.1763030290603638, "learning_rate": 0.0002, "epoch": 4.434285714285714, "step": 1940}, {"loss": 1.1541, "grad_norm": 0.9183000326156616, "learning_rate": 0.0002, "epoch": 4.457142857142857, "step": 1950}, {"loss": 1.2257, "grad_norm": 1.050980806350708, "learning_rate": 0.0002, "epoch": 4.48, "step": 1960}, {"loss": 1.2133, "grad_norm": 0.9975392818450928, "learning_rate": 0.0002, "epoch": 4.502857142857143, "step": 1970}, {"loss": 1.2312, "grad_norm": 0.990544319152832, "learning_rate": 0.0002, "epoch": 4.525714285714286, "step": 1980}, {"loss": 1.2465, "grad_norm": 1.004794955253601, "learning_rate": 0.0002, "epoch": 4.548571428571429, "step": 1990}, {"loss": 1.2085, "grad_norm": 0.9294857978820801, "learning_rate": 0.0002, "epoch": 4.571428571428571, "step": 2000}, {"loss": 1.2874, "grad_norm": 0.93436598777771, "learning_rate": 0.0002, "epoch": 4.594285714285714, "step": 2010}, {"loss": 1.1965, "grad_norm": 0.8704655766487122, "learning_rate": 0.0002, "epoch": 4.617142857142857, "step": 2020}, {"loss": 1.204, "grad_norm": 0.9077927470207214, "learning_rate": 0.0002, "epoch": 4.64, "step": 2030}, {"loss": 1.2198, "grad_norm": 0.912987470626831, "learning_rate": 0.0002, "epoch": 4.662857142857143, "step": 2040}, {"loss": 1.2868, "grad_norm": 0.9740643501281738, "learning_rate": 0.0002, "epoch": 4.685714285714286, "step": 2050}, {"loss": 1.249, "grad_norm": 1.133357048034668, "learning_rate": 0.0002, "epoch": 4.708571428571428, "step": 2060}, {"loss": 1.1974, "grad_norm": 0.8844527006149292, "learning_rate": 0.0002, "epoch": 4.731428571428571, "step": 2070}, {"loss": 1.2481, "grad_norm": 1.0083311796188354, "learning_rate": 0.0002, "epoch": 4.7542857142857144, "step": 2080}, {"loss": 1.263, "grad_norm": 1.000447154045105, "learning_rate": 0.0002, "epoch": 4.777142857142858, "step": 2090}, {"loss": 1.2313, "grad_norm": 0.9620300531387329, "learning_rate": 0.0002, "epoch": 4.8, "step": 2100}, {"loss": 1.2659, "grad_norm": 0.9843335151672363, "learning_rate": 0.0002, "epoch": 4.822857142857143, "step": 2110}, {"loss": 1.2535, "grad_norm": 0.9906681180000305, "learning_rate": 0.0002, "epoch": 4.845714285714286, "step": 2120}, {"loss": 1.2325, "grad_norm": 0.9544073939323425, "learning_rate": 0.0002, "epoch": 4.868571428571428, "step": 2130}, {"loss": 1.284, "grad_norm": 0.9392994046211243, "learning_rate": 0.0002, "epoch": 4.8914285714285715, "step": 2140}, {"loss": 1.3075, "grad_norm": 1.104519248008728, "learning_rate": 0.0002, "epoch": 4.914285714285715, "step": 2150}, {"loss": 1.2753, "grad_norm": 0.9495956897735596, "learning_rate": 0.0002, "epoch": 4.937142857142857, "step": 2160}, {"loss": 1.2412, "grad_norm": 0.9696287512779236, "learning_rate": 0.0002, "epoch": 4.96, "step": 2170}, {"loss": 1.2354, "grad_norm": 0.9933681488037109, "learning_rate": 0.0002, "epoch": 4.982857142857143, "step": 2180}, {"eval_loss": 2.099808692932129, "eval_runtime": 111.2808, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 4.998857142857143, "step": 2187}, {"loss": 1.2183, "grad_norm": 0.9482853412628174, "learning_rate": 0.0002, "epoch": 5.005714285714285, "step": 2190}, {"loss": 0.9898, "grad_norm": 1.6689555644989014, "learning_rate": 0.0002, "epoch": 5.0285714285714285, "step": 2200}, {"loss": 0.9741, "grad_norm": 1.2019699811935425, "learning_rate": 0.0002, "epoch": 5.051428571428572, "step": 2210}, {"loss": 0.9737, "grad_norm": 1.535780429840088, "learning_rate": 0.0002, "epoch": 5.074285714285715, "step": 2220}, {"loss": 0.9494, "grad_norm": 1.2061309814453125, "learning_rate": 0.0002, "epoch": 5.097142857142857, "step": 2230}, {"loss": 0.9316, "grad_norm": 1.1898778676986694, "learning_rate": 0.0002, "epoch": 5.12, "step": 2240}, {"loss": 1.002, "grad_norm": 1.158898949623108, "learning_rate": 0.0002, "epoch": 5.142857142857143, "step": 2250}, {"loss": 0.9715, "grad_norm": 1.370749592781067, "learning_rate": 0.0002, "epoch": 5.1657142857142855, "step": 2260}, {"loss": 0.9365, "grad_norm": 1.314120888710022, "learning_rate": 0.0002, "epoch": 5.188571428571429, "step": 2270}, {"loss": 1.0316, "grad_norm": 1.2184966802597046, "learning_rate": 0.0002, "epoch": 5.211428571428572, "step": 2280}, {"loss": 0.9407, "grad_norm": 1.4833279848098755, "learning_rate": 0.0002, "epoch": 5.234285714285714, "step": 2290}, {"loss": 0.9635, "grad_norm": 1.3348219394683838, "learning_rate": 0.0002, "epoch": 5.257142857142857, "step": 2300}, {"loss": 1.0294, "grad_norm": 1.4166619777679443, "learning_rate": 0.0002, "epoch": 5.28, "step": 2310}, {"loss": 0.9818, "grad_norm": 1.4539530277252197, "learning_rate": 0.0002, "epoch": 5.3028571428571425, "step": 2320}, {"loss": 1.0165, "grad_norm": 1.4642518758773804, "learning_rate": 0.0002, "epoch": 5.325714285714286, "step": 2330}, {"loss": 1.0081, "grad_norm": 1.3938848972320557, "learning_rate": 0.0002, "epoch": 5.348571428571429, "step": 2340}, {"loss": 1.03, "grad_norm": 1.1147894859313965, "learning_rate": 0.0002, "epoch": 5.371428571428572, "step": 2350}, {"loss": 0.9975, "grad_norm": 1.3465309143066406, "learning_rate": 0.0002, "epoch": 5.394285714285714, "step": 2360}, {"loss": 1.0138, "grad_norm": 1.4788566827774048, "learning_rate": 0.0002, "epoch": 5.417142857142857, "step": 2370}, {"loss": 0.9896, "grad_norm": 1.3808705806732178, "learning_rate": 0.0002, "epoch": 5.44, "step": 2380}, {"loss": 1.0279, "grad_norm": 1.2336329221725464, "learning_rate": 0.0002, "epoch": 5.462857142857143, "step": 2390}, {"loss": 0.9763, "grad_norm": 1.5445678234100342, "learning_rate": 0.0002, "epoch": 5.485714285714286, "step": 2400}, {"loss": 0.9534, "grad_norm": 1.107488989830017, "learning_rate": 0.0002, "epoch": 5.508571428571429, "step": 2410}, {"loss": 1.0036, "grad_norm": 1.39687979221344, "learning_rate": 0.0002, "epoch": 5.531428571428571, "step": 2420}, {"loss": 0.9959, "grad_norm": 1.3905695676803589, "learning_rate": 0.0002, "epoch": 5.554285714285714, "step": 2430}, {"loss": 0.9912, "grad_norm": 1.3772821426391602, "learning_rate": 0.0002, "epoch": 5.577142857142857, "step": 2440}, {"loss": 0.9825, "grad_norm": 1.1661899089813232, "learning_rate": 0.0002, "epoch": 5.6, "step": 2450}, {"loss": 1.0003, "grad_norm": 1.2730463743209839, "learning_rate": 0.0002, "epoch": 5.622857142857143, "step": 2460}, {"loss": 1.0433, "grad_norm": 1.2251193523406982, "learning_rate": 0.0002, "epoch": 5.645714285714286, "step": 2470}, {"loss": 1.079, "grad_norm": 1.5454859733581543, "learning_rate": 0.0002, "epoch": 5.668571428571429, "step": 2480}, {"loss": 1.0414, "grad_norm": 1.5405735969543457, "learning_rate": 0.0002, "epoch": 5.691428571428571, "step": 2490}, {"loss": 1.0353, "grad_norm": 1.2555434703826904, "learning_rate": 0.0002, "epoch": 5.714285714285714, "step": 2500}, {"loss": 1.0019, "grad_norm": 1.3323487043380737, "learning_rate": 0.0002, "epoch": 5.737142857142857, "step": 2510}, {"loss": 1.051, "grad_norm": 1.3106356859207153, "learning_rate": 0.0002, "epoch": 5.76, "step": 2520}, {"loss": 1.0248, "grad_norm": 1.4832439422607422, "learning_rate": 0.0002, "epoch": 5.782857142857143, "step": 2530}, {"loss": 1.0643, "grad_norm": 1.1336562633514404, "learning_rate": 0.0002, "epoch": 5.805714285714286, "step": 2540}, {"loss": 1.0446, "grad_norm": 1.2434223890304565, "learning_rate": 0.0002, "epoch": 5.828571428571428, "step": 2550}, {"loss": 1.0467, "grad_norm": 1.2825450897216797, "learning_rate": 0.0002, "epoch": 5.851428571428571, "step": 2560}, {"loss": 1.0642, "grad_norm": 1.4373180866241455, "learning_rate": 0.0002, "epoch": 5.8742857142857146, "step": 2570}, {"loss": 1.0814, "grad_norm": 1.435015320777893, "learning_rate": 0.0002, "epoch": 5.897142857142857, "step": 2580}, {"loss": 1.0272, "grad_norm": 1.4075653553009033, "learning_rate": 0.0002, "epoch": 5.92, "step": 2590}, {"loss": 1.0703, "grad_norm": 1.319630742073059, "learning_rate": 0.0002, "epoch": 5.942857142857143, "step": 2600}, {"loss": 1.0375, "grad_norm": 1.278330683708191, "learning_rate": 0.0002, "epoch": 5.965714285714286, "step": 2610}, {"loss": 1.0766, "grad_norm": 1.258158564567566, "learning_rate": 0.0002, "epoch": 5.988571428571428, "step": 2620}, {"eval_loss": 2.3689301013946533, "eval_runtime": 53.9067, "eval_samples_per_second": 9.405, "eval_steps_per_second": 1.187, "epoch": 6.0, "step": 2625}, {"loss": 0.9142, "grad_norm": 1.3128368854522705, "learning_rate": 0.0002, "epoch": 6.011428571428572, "step": 2630}, {"loss": 0.7716, "grad_norm": 1.4280474185943604, "learning_rate": 0.0002, "epoch": 6.034285714285715, "step": 2640}, {"loss": 0.7776, "grad_norm": 1.5061450004577637, "learning_rate": 0.0002, "epoch": 6.057142857142857, "step": 2650}, {"loss": 0.7707, "grad_norm": 1.6013342142105103, "learning_rate": 0.0002, "epoch": 6.08, "step": 2660}, {"loss": 0.7543, "grad_norm": 2.0107381343841553, "learning_rate": 0.0002, "epoch": 6.102857142857143, "step": 2670}, {"loss": 0.747, "grad_norm": 1.5010124444961548, "learning_rate": 0.0002, "epoch": 6.1257142857142854, "step": 2680}, {"loss": 0.7501, "grad_norm": 1.5222150087356567, "learning_rate": 0.0002, "epoch": 6.148571428571429, "step": 2690}, {"loss": 0.7712, "grad_norm": 1.5413103103637695, "learning_rate": 0.0002, "epoch": 6.171428571428572, "step": 2700}, {"loss": 0.7, "grad_norm": 1.527140736579895, "learning_rate": 0.0002, "epoch": 6.194285714285714, "step": 2710}, {"loss": 0.7539, "grad_norm": 1.9386590719223022, "learning_rate": 0.0002, "epoch": 6.217142857142857, "step": 2720}, {"loss": 0.7586, "grad_norm": 1.8115214109420776, "learning_rate": 0.0002, "epoch": 6.24, "step": 2730}, {"loss": 0.7426, "grad_norm": 1.6221802234649658, "learning_rate": 0.0002, "epoch": 6.2628571428571425, "step": 2740}, {"loss": 0.8002, "grad_norm": 1.6698768138885498, "learning_rate": 0.0002, "epoch": 6.285714285714286, "step": 2750}, {"loss": 0.7293, "grad_norm": 1.7960610389709473, "learning_rate": 0.0002, "epoch": 6.308571428571429, "step": 2760}, {"loss": 0.7405, "grad_norm": 1.32172429561615, "learning_rate": 0.0002, "epoch": 6.331428571428571, "step": 2770}, {"loss": 0.7198, "grad_norm": 1.7468090057373047, "learning_rate": 0.0002, "epoch": 6.354285714285714, "step": 2780}, {"loss": 0.76, "grad_norm": 1.6777397394180298, "learning_rate": 0.0002, "epoch": 6.377142857142857, "step": 2790}, {"loss": 0.7879, "grad_norm": 1.6200671195983887, "learning_rate": 0.0002, "epoch": 6.4, "step": 2800}, {"loss": 0.7807, "grad_norm": 1.723505973815918, "learning_rate": 0.0002, "epoch": 6.422857142857143, "step": 2810}, {"loss": 0.8645, "grad_norm": 1.4945589303970337, "learning_rate": 0.0002, "epoch": 6.445714285714286, "step": 2820}, {"loss": 0.809, "grad_norm": 1.666458010673523, "learning_rate": 0.0002, "epoch": 6.468571428571429, "step": 2830}, {"loss": 0.7996, "grad_norm": 1.6586525440216064, "learning_rate": 0.0002, "epoch": 6.491428571428571, "step": 2840}, {"loss": 0.8062, "grad_norm": 1.7480043172836304, "learning_rate": 0.0002, "epoch": 6.514285714285714, "step": 2850}, {"loss": 0.7602, "grad_norm": 1.4605649709701538, "learning_rate": 0.0002, "epoch": 6.537142857142857, "step": 2860}, {"loss": 0.8186, "grad_norm": 1.4841814041137695, "learning_rate": 0.0002, "epoch": 6.5600000000000005, "step": 2870}, {"loss": 0.8156, "grad_norm": 1.4653114080429077, "learning_rate": 0.0002, "epoch": 6.582857142857143, "step": 2880}, {"loss": 0.8111, "grad_norm": 1.7266837358474731, "learning_rate": 0.0002, "epoch": 6.605714285714286, "step": 2890}, {"loss": 0.7644, "grad_norm": 1.4860098361968994, "learning_rate": 0.0002, "epoch": 6.628571428571428, "step": 2900}, {"loss": 0.7991, "grad_norm": 1.7177597284317017, "learning_rate": 0.0002, "epoch": 6.651428571428571, "step": 2910}, {"loss": 0.7883, "grad_norm": 1.6757104396820068, "learning_rate": 0.0002, "epoch": 6.674285714285714, "step": 2920}, {"loss": 0.8598, "grad_norm": 1.5177433490753174, "learning_rate": 0.0002, "epoch": 6.6971428571428575, "step": 2930}, {"loss": 0.7825, "grad_norm": 1.8073889017105103, "learning_rate": 0.0002, "epoch": 6.72, "step": 2940}, {"loss": 0.8234, "grad_norm": 1.72337007522583, "learning_rate": 0.0002, "epoch": 6.742857142857143, "step": 2950}, {"loss": 0.896, "grad_norm": 1.6298240423202515, "learning_rate": 0.0002, "epoch": 6.765714285714286, "step": 2960}, {"loss": 0.8252, "grad_norm": 1.6140344142913818, "learning_rate": 0.0002, "epoch": 6.788571428571428, "step": 2970}, {"loss": 0.8314, "grad_norm": 1.7180862426757812, "learning_rate": 0.0002, "epoch": 6.811428571428571, "step": 2980}, {"loss": 0.7929, "grad_norm": 1.7589894533157349, "learning_rate": 0.0002, "epoch": 6.8342857142857145, "step": 2990}, {"loss": 0.828, "grad_norm": 1.780195713043213, "learning_rate": 0.0002, "epoch": 6.857142857142857, "step": 3000}, {"loss": 0.8943, "grad_norm": 1.7182508707046509, "learning_rate": 0.0002, "epoch": 6.88, "step": 3010}, {"loss": 0.7964, "grad_norm": 1.6308406591415405, "learning_rate": 0.0002, "epoch": 6.902857142857143, "step": 3020}, {"loss": 0.8207, "grad_norm": 1.5080229043960571, "learning_rate": 0.0002, "epoch": 6.925714285714285, "step": 3030}, {"loss": 0.886, "grad_norm": 1.623555064201355, "learning_rate": 0.0002, "epoch": 6.948571428571428, "step": 3040}, {"loss": 0.8377, "grad_norm": 1.526054859161377, "learning_rate": 0.0002, "epoch": 6.9714285714285715, "step": 3050}, {"loss": 0.8816, "grad_norm": 1.6671174764633179, "learning_rate": 0.0002, "epoch": 6.994285714285715, "step": 3060}]} +{"epoch": 0.9988571428571429, "step": 437, "epoch_duration": 1529.840027332306, "total_accumulated_duration": 1529.840027332306, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0606, "grad_norm": 0.7581241726875305, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5819, "grad_norm": 0.528847873210907, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1908, "grad_norm": 0.656889796257019, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1226, "grad_norm": 0.4785534143447876, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.9471, "grad_norm": 0.4260832369327545, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.953, "grad_norm": 0.43787336349487305, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9426, "grad_norm": 0.48819172382354736, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9007, "grad_norm": 0.45916950702667236, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9302, "grad_norm": 0.4116848409175873, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8983, "grad_norm": 1.530875325202942, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9179, "grad_norm": 0.3546597957611084, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8483, "grad_norm": 0.4091620147228241, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7997, "grad_norm": 0.3846702575683594, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.887, "grad_norm": 0.3834623396396637, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.833, "grad_norm": 0.3492090106010437, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.8257, "grad_norm": 0.3689991235733032, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7556, "grad_norm": 0.34498894214630127, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8301, "grad_norm": 0.3145627975463867, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8202, "grad_norm": 0.3054567277431488, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.8217, "grad_norm": 0.3581295907497406, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8552, "grad_norm": 0.35844793915748596, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8673, "grad_norm": 0.3790668547153473, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8689, "grad_norm": 0.36423084139823914, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.853, "grad_norm": 0.32898446917533875, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8766, "grad_norm": 0.35387709736824036, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7915, "grad_norm": 0.31785526871681213, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.8075, "grad_norm": 0.3718429505825043, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.823, "grad_norm": 0.30403199791908264, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8317, "grad_norm": 0.31306174397468567, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.775, "grad_norm": 0.29847824573516846, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8457, "grad_norm": 0.33621150255203247, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8111, "grad_norm": 0.2977754771709442, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7753, "grad_norm": 0.31184402108192444, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8029, "grad_norm": 0.3459164500236511, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7692, "grad_norm": 0.34528955817222595, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.7278, "grad_norm": 0.36549675464630127, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8474, "grad_norm": 0.370948851108551, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.88, "grad_norm": 0.30869928002357483, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8072, "grad_norm": 0.3989962637424469, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7523, "grad_norm": 0.3082427680492401, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7629, "grad_norm": 0.33315128087997437, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8118, "grad_norm": 0.32147616147994995, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7694, "grad_norm": 0.30213138461112976, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}]} +{"epoch": 7.990857142857143, "step": 3496, "epoch_duration": 1356.2125453948975, "total_accumulated_duration": 10769.926096916199, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-42/checkpoint-875", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.066, "grad_norm": 0.6273946762084961, "learning_rate": 0.0002, "epoch": 0.022857142857142857, "step": 10}, {"loss": 2.5855, "grad_norm": 0.5300710201263428, "learning_rate": 0.0002, "epoch": 0.045714285714285714, "step": 20}, {"loss": 2.1887, "grad_norm": 0.6162196397781372, "learning_rate": 0.0002, "epoch": 0.06857142857142857, "step": 30}, {"loss": 2.1164, "grad_norm": 0.5143047571182251, "learning_rate": 0.0002, "epoch": 0.09142857142857143, "step": 40}, {"loss": 1.943, "grad_norm": 0.4000673294067383, "learning_rate": 0.0002, "epoch": 0.11428571428571428, "step": 50}, {"loss": 1.9531, "grad_norm": 0.444892555475235, "learning_rate": 0.0002, "epoch": 0.13714285714285715, "step": 60}, {"loss": 1.9435, "grad_norm": 0.4871707558631897, "learning_rate": 0.0002, "epoch": 0.16, "step": 70}, {"loss": 1.9072, "grad_norm": 0.451060026884079, "learning_rate": 0.0002, "epoch": 0.18285714285714286, "step": 80}, {"loss": 1.9312, "grad_norm": 0.3939569592475891, "learning_rate": 0.0002, "epoch": 0.2057142857142857, "step": 90}, {"loss": 1.8982, "grad_norm": 0.5033721923828125, "learning_rate": 0.0002, "epoch": 0.22857142857142856, "step": 100}, {"loss": 1.9148, "grad_norm": 0.3636534512042999, "learning_rate": 0.0002, "epoch": 0.25142857142857145, "step": 110}, {"loss": 1.8462, "grad_norm": 0.4391206204891205, "learning_rate": 0.0002, "epoch": 0.2742857142857143, "step": 120}, {"loss": 1.7998, "grad_norm": 0.5243169665336609, "learning_rate": 0.0002, "epoch": 0.29714285714285715, "step": 130}, {"loss": 1.8875, "grad_norm": 0.4055655598640442, "learning_rate": 0.0002, "epoch": 0.32, "step": 140}, {"loss": 1.8348, "grad_norm": 0.39735132455825806, "learning_rate": 0.0002, "epoch": 0.34285714285714286, "step": 150}, {"loss": 1.824, "grad_norm": 0.4696349501609802, "learning_rate": 0.0002, "epoch": 0.3657142857142857, "step": 160}, {"loss": 1.7566, "grad_norm": 0.3987901508808136, "learning_rate": 0.0002, "epoch": 0.38857142857142857, "step": 170}, {"loss": 1.8288, "grad_norm": 0.32404327392578125, "learning_rate": 0.0002, "epoch": 0.4114285714285714, "step": 180}, {"loss": 1.8178, "grad_norm": 0.3692261576652527, "learning_rate": 0.0002, "epoch": 0.4342857142857143, "step": 190}, {"loss": 1.823, "grad_norm": 0.37267744541168213, "learning_rate": 0.0002, "epoch": 0.45714285714285713, "step": 200}, {"loss": 1.8564, "grad_norm": 0.3559934198856354, "learning_rate": 0.0002, "epoch": 0.48, "step": 210}, {"loss": 1.8651, "grad_norm": 0.3374815285205841, "learning_rate": 0.0002, "epoch": 0.5028571428571429, "step": 220}, {"loss": 1.8683, "grad_norm": 0.34598177671432495, "learning_rate": 0.0002, "epoch": 0.5257142857142857, "step": 230}, {"loss": 1.8554, "grad_norm": 0.35629919171333313, "learning_rate": 0.0002, "epoch": 0.5485714285714286, "step": 240}, {"loss": 1.8751, "grad_norm": 0.3586862087249756, "learning_rate": 0.0002, "epoch": 0.5714285714285714, "step": 250}, {"loss": 1.7942, "grad_norm": 0.3198927342891693, "learning_rate": 0.0002, "epoch": 0.5942857142857143, "step": 260}, {"loss": 1.81, "grad_norm": 0.37690025568008423, "learning_rate": 0.0002, "epoch": 0.6171428571428571, "step": 270}, {"loss": 1.8258, "grad_norm": 0.2855667471885681, "learning_rate": 0.0002, "epoch": 0.64, "step": 280}, {"loss": 1.8288, "grad_norm": 0.3242695927619934, "learning_rate": 0.0002, "epoch": 0.6628571428571428, "step": 290}, {"loss": 1.7738, "grad_norm": 0.2960120141506195, "learning_rate": 0.0002, "epoch": 0.6857142857142857, "step": 300}, {"loss": 1.8443, "grad_norm": 0.3596384823322296, "learning_rate": 0.0002, "epoch": 0.7085714285714285, "step": 310}, {"loss": 1.8132, "grad_norm": 0.3001834750175476, "learning_rate": 0.0002, "epoch": 0.7314285714285714, "step": 320}, {"loss": 1.7751, "grad_norm": 0.31361159682273865, "learning_rate": 0.0002, "epoch": 0.7542857142857143, "step": 330}, {"loss": 1.8021, "grad_norm": 0.34093308448791504, "learning_rate": 0.0002, "epoch": 0.7771428571428571, "step": 340}, {"loss": 1.7674, "grad_norm": 0.3383876085281372, "learning_rate": 0.0002, "epoch": 0.8, "step": 350}, {"loss": 1.729, "grad_norm": 0.35100996494293213, "learning_rate": 0.0002, "epoch": 0.8228571428571428, "step": 360}, {"loss": 1.8517, "grad_norm": 0.344976007938385, "learning_rate": 0.0002, "epoch": 0.8457142857142858, "step": 370}, {"loss": 1.8779, "grad_norm": 0.3119729459285736, "learning_rate": 0.0002, "epoch": 0.8685714285714285, "step": 380}, {"loss": 1.8043, "grad_norm": 0.349221795797348, "learning_rate": 0.0002, "epoch": 0.8914285714285715, "step": 390}, {"loss": 1.7529, "grad_norm": 0.3124293386936188, "learning_rate": 0.0002, "epoch": 0.9142857142857143, "step": 400}, {"loss": 1.7611, "grad_norm": 0.35504350066185, "learning_rate": 0.0002, "epoch": 0.9371428571428572, "step": 410}, {"loss": 1.8115, "grad_norm": 0.310310959815979, "learning_rate": 0.0002, "epoch": 0.96, "step": 420}, {"loss": 1.7666, "grad_norm": 0.30432847142219543, "learning_rate": 0.0002, "epoch": 0.9828571428571429, "step": 430}, {"eval_loss": 1.8310153484344482, "eval_runtime": 111.7814, "eval_samples_per_second": 4.536, "eval_steps_per_second": 0.573, "epoch": 0.9988571428571429, "step": 437}, {"loss": 1.7435, "grad_norm": 0.3121616840362549, "learning_rate": 0.0002, "epoch": 1.0057142857142858, "step": 440}, {"loss": 1.7404, "grad_norm": 0.3365118205547333, "learning_rate": 0.0002, "epoch": 1.0285714285714285, "step": 450}, {"loss": 1.7901, "grad_norm": 0.3626686930656433, "learning_rate": 0.0002, "epoch": 1.0514285714285714, "step": 460}, {"loss": 1.721, "grad_norm": 0.30539533495903015, "learning_rate": 0.0002, "epoch": 1.0742857142857143, "step": 470}, {"loss": 1.7354, "grad_norm": 0.3159816861152649, "learning_rate": 0.0002, "epoch": 1.0971428571428572, "step": 480}, {"loss": 1.7471, "grad_norm": 0.3695855736732483, "learning_rate": 0.0002, "epoch": 1.12, "step": 490}, {"loss": 1.7626, "grad_norm": 0.3609161674976349, "learning_rate": 0.0002, "epoch": 1.1428571428571428, "step": 500}, {"loss": 1.8723, "grad_norm": 0.3683869242668152, "learning_rate": 0.0002, "epoch": 1.1657142857142857, "step": 510}, {"loss": 1.7102, "grad_norm": 0.3862539529800415, "learning_rate": 0.0002, "epoch": 1.1885714285714286, "step": 520}, {"loss": 1.7589, "grad_norm": 0.4244740307331085, "learning_rate": 0.0002, "epoch": 1.2114285714285715, "step": 530}, {"loss": 1.717, "grad_norm": 0.373703271150589, "learning_rate": 0.0002, "epoch": 1.2342857142857142, "step": 540}, {"loss": 1.795, "grad_norm": 0.35715773701667786, "learning_rate": 0.0002, "epoch": 1.2571428571428571, "step": 550}, {"loss": 1.7578, "grad_norm": 0.3555964231491089, "learning_rate": 0.0002, "epoch": 1.28, "step": 560}, {"loss": 1.7228, "grad_norm": 0.35080263018608093, "learning_rate": 0.0002, "epoch": 1.302857142857143, "step": 570}, {"loss": 1.6808, "grad_norm": 0.3589482307434082, "learning_rate": 0.0002, "epoch": 1.3257142857142856, "step": 580}, {"loss": 1.7369, "grad_norm": 0.3711223900318146, "learning_rate": 0.0002, "epoch": 1.3485714285714285, "step": 590}, {"loss": 1.7417, "grad_norm": 0.313614159822464, "learning_rate": 0.0002, "epoch": 1.3714285714285714, "step": 600}, {"loss": 1.7191, "grad_norm": 0.3842357397079468, "learning_rate": 0.0002, "epoch": 1.3942857142857144, "step": 610}, {"loss": 1.6737, "grad_norm": 0.36126819252967834, "learning_rate": 0.0002, "epoch": 1.4171428571428573, "step": 620}, {"loss": 1.7073, "grad_norm": 0.35922661423683167, "learning_rate": 0.0002, "epoch": 1.44, "step": 630}, {"loss": 1.6708, "grad_norm": 0.3922875225543976, "learning_rate": 0.0002, "epoch": 1.4628571428571429, "step": 640}, {"loss": 1.7544, "grad_norm": 0.365546852350235, "learning_rate": 0.0002, "epoch": 1.4857142857142858, "step": 650}, {"loss": 1.674, "grad_norm": 0.36107590794563293, "learning_rate": 0.0002, "epoch": 1.5085714285714285, "step": 660}, {"loss": 1.6518, "grad_norm": 0.3307042121887207, "learning_rate": 0.0002, "epoch": 1.5314285714285716, "step": 670}, {"loss": 1.7306, "grad_norm": 0.3492133915424347, "learning_rate": 0.0002, "epoch": 1.5542857142857143, "step": 680}, {"loss": 1.812, "grad_norm": 0.38608574867248535, "learning_rate": 0.0002, "epoch": 1.5771428571428572, "step": 690}, {"loss": 1.7042, "grad_norm": 0.3489173650741577, "learning_rate": 0.0002, "epoch": 1.6, "step": 700}, {"loss": 1.7306, "grad_norm": 0.36614152789115906, "learning_rate": 0.0002, "epoch": 1.6228571428571428, "step": 710}, {"loss": 1.7281, "grad_norm": 0.34340205788612366, "learning_rate": 0.0002, "epoch": 1.6457142857142857, "step": 720}, {"loss": 1.7607, "grad_norm": 0.34590771794319153, "learning_rate": 0.0002, "epoch": 1.6685714285714286, "step": 730}, {"loss": 1.7108, "grad_norm": 0.3759954273700714, "learning_rate": 0.0002, "epoch": 1.6914285714285713, "step": 740}, {"loss": 1.6903, "grad_norm": 0.3753475546836853, "learning_rate": 0.0002, "epoch": 1.7142857142857144, "step": 750}, {"loss": 1.7054, "grad_norm": 0.38416001200675964, "learning_rate": 0.0002, "epoch": 1.737142857142857, "step": 760}, {"loss": 1.7125, "grad_norm": 0.36223554611206055, "learning_rate": 0.0002, "epoch": 1.76, "step": 770}, {"loss": 1.6878, "grad_norm": 0.329556941986084, "learning_rate": 0.0002, "epoch": 1.782857142857143, "step": 780}, {"loss": 1.803, "grad_norm": 0.34008052945137024, "learning_rate": 0.0002, "epoch": 1.8057142857142856, "step": 790}, {"loss": 1.6545, "grad_norm": 0.40297919511795044, "learning_rate": 0.0002, "epoch": 1.8285714285714287, "step": 800}, {"loss": 1.7518, "grad_norm": 0.35378390550613403, "learning_rate": 0.0002, "epoch": 1.8514285714285714, "step": 810}, {"loss": 1.7063, "grad_norm": 0.3625478148460388, "learning_rate": 0.0002, "epoch": 1.8742857142857143, "step": 820}, {"loss": 1.6339, "grad_norm": 0.36153221130371094, "learning_rate": 0.0002, "epoch": 1.8971428571428572, "step": 830}, {"loss": 1.7434, "grad_norm": 0.3612948954105377, "learning_rate": 0.0002, "epoch": 1.92, "step": 840}, {"loss": 1.7242, "grad_norm": 0.399213045835495, "learning_rate": 0.0002, "epoch": 1.9428571428571428, "step": 850}, {"loss": 1.6909, "grad_norm": 0.40026402473449707, "learning_rate": 0.0002, "epoch": 1.9657142857142857, "step": 860}, {"loss": 1.76, "grad_norm": 0.38114118576049805, "learning_rate": 0.0002, "epoch": 1.9885714285714284, "step": 870}, {"eval_loss": 1.8279441595077515, "eval_runtime": 111.1652, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 875}, {"loss": 1.7215, "grad_norm": 0.33838793635368347, "learning_rate": 0.0002, "epoch": 2.0114285714285716, "step": 880}, {"loss": 1.6105, "grad_norm": 0.4219334125518799, "learning_rate": 0.0002, "epoch": 2.0342857142857143, "step": 890}, {"loss": 1.6144, "grad_norm": 0.43962377309799194, "learning_rate": 0.0002, "epoch": 2.057142857142857, "step": 900}, {"loss": 1.5821, "grad_norm": 0.41956576704978943, "learning_rate": 0.0002, "epoch": 2.08, "step": 910}, {"loss": 1.5812, "grad_norm": 0.4439629912376404, "learning_rate": 0.0002, "epoch": 2.1028571428571428, "step": 920}, {"loss": 1.5923, "grad_norm": 0.43405696749687195, "learning_rate": 0.0002, "epoch": 2.125714285714286, "step": 930}, {"loss": 1.5968, "grad_norm": 0.4321737587451935, "learning_rate": 0.0002, "epoch": 2.1485714285714286, "step": 940}, {"loss": 1.5683, "grad_norm": 0.4689100682735443, "learning_rate": 0.0002, "epoch": 2.1714285714285713, "step": 950}, {"loss": 1.6442, "grad_norm": 0.47024697065353394, "learning_rate": 0.0002, "epoch": 2.1942857142857144, "step": 960}, {"loss": 1.4703, "grad_norm": 0.4535103440284729, "learning_rate": 0.0002, "epoch": 2.217142857142857, "step": 970}, {"loss": 1.6378, "grad_norm": 0.45990121364593506, "learning_rate": 0.0002, "epoch": 2.24, "step": 980}, {"loss": 1.627, "grad_norm": 0.48427215218544006, "learning_rate": 0.0002, "epoch": 2.262857142857143, "step": 990}, {"loss": 1.6316, "grad_norm": 0.43076643347740173, "learning_rate": 0.0002, "epoch": 2.2857142857142856, "step": 1000}, {"loss": 1.5938, "grad_norm": 0.4854483902454376, "learning_rate": 0.0002, "epoch": 2.3085714285714287, "step": 1010}, {"loss": 1.6223, "grad_norm": 0.46086496114730835, "learning_rate": 0.0002, "epoch": 2.3314285714285714, "step": 1020}, {"loss": 1.6392, "grad_norm": 0.4714847505092621, "learning_rate": 0.0002, "epoch": 2.354285714285714, "step": 1030}, {"loss": 1.5785, "grad_norm": 0.4423409402370453, "learning_rate": 0.0002, "epoch": 2.3771428571428572, "step": 1040}, {"loss": 1.5821, "grad_norm": 0.46261295676231384, "learning_rate": 0.0002, "epoch": 2.4, "step": 1050}, {"loss": 1.5503, "grad_norm": 0.4914337396621704, "learning_rate": 0.0002, "epoch": 2.422857142857143, "step": 1060}, {"loss": 1.6459, "grad_norm": 0.45144036412239075, "learning_rate": 0.0002, "epoch": 2.4457142857142857, "step": 1070}, {"loss": 1.6416, "grad_norm": 0.4510825276374817, "learning_rate": 0.0002, "epoch": 2.4685714285714284, "step": 1080}, {"loss": 1.5808, "grad_norm": 0.48552489280700684, "learning_rate": 0.0002, "epoch": 2.4914285714285715, "step": 1090}, {"loss": 1.6659, "grad_norm": 0.4768163859844208, "learning_rate": 0.0002, "epoch": 2.5142857142857142, "step": 1100}, {"loss": 1.6251, "grad_norm": 0.5192609429359436, "learning_rate": 0.0002, "epoch": 2.5371428571428574, "step": 1110}, {"loss": 1.5581, "grad_norm": 0.49308598041534424, "learning_rate": 0.0002, "epoch": 2.56, "step": 1120}, {"loss": 1.6767, "grad_norm": 0.5068584084510803, "learning_rate": 0.0002, "epoch": 2.5828571428571427, "step": 1130}, {"loss": 1.5788, "grad_norm": 0.4822661280632019, "learning_rate": 0.0002, "epoch": 2.605714285714286, "step": 1140}, {"loss": 1.6891, "grad_norm": 0.5028144717216492, "learning_rate": 0.0002, "epoch": 2.6285714285714286, "step": 1150}, {"loss": 1.6782, "grad_norm": 0.48315200209617615, "learning_rate": 0.0002, "epoch": 2.6514285714285712, "step": 1160}, {"loss": 1.5508, "grad_norm": 0.551934540271759, "learning_rate": 0.0002, "epoch": 2.6742857142857144, "step": 1170}, {"loss": 1.5448, "grad_norm": 0.49223729968070984, "learning_rate": 0.0002, "epoch": 2.697142857142857, "step": 1180}, {"loss": 1.6382, "grad_norm": 0.514847457408905, "learning_rate": 0.0002, "epoch": 2.7199999999999998, "step": 1190}, {"loss": 1.5054, "grad_norm": 0.4830605387687683, "learning_rate": 0.0002, "epoch": 2.742857142857143, "step": 1200}, {"loss": 1.6285, "grad_norm": 0.4584822952747345, "learning_rate": 0.0002, "epoch": 2.7657142857142856, "step": 1210}, {"loss": 1.6043, "grad_norm": 0.4688762426376343, "learning_rate": 0.0002, "epoch": 2.7885714285714287, "step": 1220}, {"loss": 1.5555, "grad_norm": 0.4488156735897064, "learning_rate": 0.0002, "epoch": 2.8114285714285714, "step": 1230}, {"loss": 1.5523, "grad_norm": 0.4700278639793396, "learning_rate": 0.0002, "epoch": 2.8342857142857145, "step": 1240}, {"loss": 1.642, "grad_norm": 0.5282207131385803, "learning_rate": 0.0002, "epoch": 2.857142857142857, "step": 1250}, {"loss": 1.5532, "grad_norm": 0.4874219000339508, "learning_rate": 0.0002, "epoch": 2.88, "step": 1260}, {"loss": 1.6051, "grad_norm": 0.49468332529067993, "learning_rate": 0.0002, "epoch": 2.902857142857143, "step": 1270}, {"loss": 1.6006, "grad_norm": 0.49770233035087585, "learning_rate": 0.0002, "epoch": 2.9257142857142857, "step": 1280}, {"loss": 1.606, "grad_norm": 0.4433252811431885, "learning_rate": 0.0002, "epoch": 2.9485714285714284, "step": 1290}, {"loss": 1.608, "grad_norm": 0.46836379170417786, "learning_rate": 0.0002, "epoch": 2.9714285714285715, "step": 1300}, {"loss": 1.5555, "grad_norm": 0.5001904368400574, "learning_rate": 0.0002, "epoch": 2.994285714285714, "step": 1310}, {"eval_loss": 1.8684407472610474, "eval_runtime": 111.2835, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 2.998857142857143, "step": 1312}, {"loss": 1.4486, "grad_norm": 0.5184893012046814, "learning_rate": 0.0002, "epoch": 3.0171428571428573, "step": 1320}, {"loss": 1.4082, "grad_norm": 0.5665355920791626, "learning_rate": 0.0002, "epoch": 3.04, "step": 1330}, {"loss": 1.3741, "grad_norm": 0.6601403951644897, "learning_rate": 0.0002, "epoch": 3.0628571428571427, "step": 1340}, {"loss": 1.433, "grad_norm": 0.6921621561050415, "learning_rate": 0.0002, "epoch": 3.085714285714286, "step": 1350}, {"loss": 1.4562, "grad_norm": 0.6406348943710327, "learning_rate": 0.0002, "epoch": 3.1085714285714285, "step": 1360}, {"loss": 1.3563, "grad_norm": 0.5814554691314697, "learning_rate": 0.0002, "epoch": 3.1314285714285712, "step": 1370}, {"loss": 1.4096, "grad_norm": 0.683325469493866, "learning_rate": 0.0002, "epoch": 3.1542857142857144, "step": 1380}, {"loss": 1.4106, "grad_norm": 0.6686155200004578, "learning_rate": 0.0002, "epoch": 3.177142857142857, "step": 1390}, {"loss": 1.4394, "grad_norm": 0.8159713745117188, "learning_rate": 0.0002, "epoch": 3.2, "step": 1400}, {"loss": 1.4279, "grad_norm": 0.646216094493866, "learning_rate": 0.0002, "epoch": 3.222857142857143, "step": 1410}, {"loss": 1.4232, "grad_norm": 0.7323529720306396, "learning_rate": 0.0002, "epoch": 3.2457142857142856, "step": 1420}, {"loss": 1.3891, "grad_norm": 0.689349353313446, "learning_rate": 0.0002, "epoch": 3.2685714285714287, "step": 1430}, {"loss": 1.4578, "grad_norm": 0.727894127368927, "learning_rate": 0.0002, "epoch": 3.2914285714285714, "step": 1440}, {"loss": 1.4, "grad_norm": 0.6921590566635132, "learning_rate": 0.0002, "epoch": 3.314285714285714, "step": 1450}, {"loss": 1.4272, "grad_norm": 0.6176243424415588, "learning_rate": 0.0002, "epoch": 3.337142857142857, "step": 1460}, {"loss": 1.4323, "grad_norm": 0.9006354212760925, "learning_rate": 0.0002, "epoch": 3.36, "step": 1470}, {"loss": 1.4353, "grad_norm": 0.8145929574966431, "learning_rate": 0.0002, "epoch": 3.382857142857143, "step": 1480}, {"loss": 1.3859, "grad_norm": 0.6640016436576843, "learning_rate": 0.0002, "epoch": 3.4057142857142857, "step": 1490}, {"loss": 1.387, "grad_norm": 0.7266780138015747, "learning_rate": 0.0002, "epoch": 3.4285714285714284, "step": 1500}, {"loss": 1.4108, "grad_norm": 0.9351356029510498, "learning_rate": 0.0002, "epoch": 3.4514285714285715, "step": 1510}, {"loss": 1.4656, "grad_norm": 0.675645649433136, "learning_rate": 0.0002, "epoch": 3.474285714285714, "step": 1520}, {"loss": 1.384, "grad_norm": 0.761472225189209, "learning_rate": 0.0002, "epoch": 3.4971428571428573, "step": 1530}, {"loss": 1.4968, "grad_norm": 0.6653069257736206, "learning_rate": 0.0002, "epoch": 3.52, "step": 1540}, {"loss": 1.4686, "grad_norm": 0.667412519454956, "learning_rate": 0.0002, "epoch": 3.5428571428571427, "step": 1550}, {"loss": 1.4241, "grad_norm": 0.6395593881607056, "learning_rate": 0.0002, "epoch": 3.565714285714286, "step": 1560}, {"loss": 1.4825, "grad_norm": 0.7588621377944946, "learning_rate": 0.0002, "epoch": 3.5885714285714285, "step": 1570}, {"loss": 1.4459, "grad_norm": 0.6206456422805786, "learning_rate": 0.0002, "epoch": 3.611428571428571, "step": 1580}, {"loss": 1.436, "grad_norm": 0.7591291666030884, "learning_rate": 0.0002, "epoch": 3.6342857142857143, "step": 1590}, {"loss": 1.458, "grad_norm": 0.6476313471794128, "learning_rate": 0.0002, "epoch": 3.657142857142857, "step": 1600}, {"loss": 1.4598, "grad_norm": 0.6731392741203308, "learning_rate": 0.0002, "epoch": 3.68, "step": 1610}, {"loss": 1.4225, "grad_norm": 0.725190281867981, "learning_rate": 0.0002, "epoch": 3.702857142857143, "step": 1620}, {"loss": 1.4525, "grad_norm": 0.6720049977302551, "learning_rate": 0.0002, "epoch": 3.725714285714286, "step": 1630}, {"loss": 1.429, "grad_norm": 0.6301007270812988, "learning_rate": 0.0002, "epoch": 3.7485714285714287, "step": 1640}, {"loss": 1.4166, "grad_norm": 0.715893566608429, "learning_rate": 0.0002, "epoch": 3.7714285714285714, "step": 1650}, {"loss": 1.3624, "grad_norm": 0.7539359927177429, "learning_rate": 0.0002, "epoch": 3.7942857142857145, "step": 1660}, {"loss": 1.4516, "grad_norm": 0.6658543348312378, "learning_rate": 0.0002, "epoch": 3.817142857142857, "step": 1670}, {"loss": 1.3934, "grad_norm": 0.7019526958465576, "learning_rate": 0.0002, "epoch": 3.84, "step": 1680}, {"loss": 1.4436, "grad_norm": 0.6517802476882935, "learning_rate": 0.0002, "epoch": 3.862857142857143, "step": 1690}, {"loss": 1.4968, "grad_norm": 0.7617332935333252, "learning_rate": 0.0002, "epoch": 3.8857142857142857, "step": 1700}, {"loss": 1.5145, "grad_norm": 0.6919480562210083, "learning_rate": 0.0002, "epoch": 3.9085714285714284, "step": 1710}, {"loss": 1.4317, "grad_norm": 0.6987943053245544, "learning_rate": 0.0002, "epoch": 3.9314285714285715, "step": 1720}, {"loss": 1.4704, "grad_norm": 0.7062228918075562, "learning_rate": 0.0002, "epoch": 3.954285714285714, "step": 1730}, {"loss": 1.4219, "grad_norm": 0.6769542098045349, "learning_rate": 0.0002, "epoch": 3.977142857142857, "step": 1740}, {"loss": 1.4998, "grad_norm": 0.6832144260406494, "learning_rate": 0.0002, "epoch": 4.0, "step": 1750}, {"eval_loss": 1.9474865198135376, "eval_runtime": 111.288, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 4.0, "step": 1750}, {"loss": 1.2251, "grad_norm": 1.064110279083252, "learning_rate": 0.0002, "epoch": 4.022857142857143, "step": 1760}, {"loss": 1.2013, "grad_norm": 0.8380683660507202, "learning_rate": 0.0002, "epoch": 4.045714285714285, "step": 1770}, {"loss": 1.2416, "grad_norm": 1.1863020658493042, "learning_rate": 0.0002, "epoch": 4.0685714285714285, "step": 1780}, {"loss": 1.2499, "grad_norm": 1.0128898620605469, "learning_rate": 0.0002, "epoch": 4.091428571428572, "step": 1790}, {"loss": 1.2043, "grad_norm": 0.9221312403678894, "learning_rate": 0.0002, "epoch": 4.114285714285714, "step": 1800}, {"loss": 1.181, "grad_norm": 1.1298727989196777, "learning_rate": 0.0002, "epoch": 4.137142857142857, "step": 1810}, {"loss": 1.1491, "grad_norm": 0.8854547739028931, "learning_rate": 0.0002, "epoch": 4.16, "step": 1820}, {"loss": 1.2156, "grad_norm": 0.8920808434486389, "learning_rate": 0.0002, "epoch": 4.182857142857143, "step": 1830}, {"loss": 1.1969, "grad_norm": 0.913244366645813, "learning_rate": 0.0002, "epoch": 4.2057142857142855, "step": 1840}, {"loss": 1.2156, "grad_norm": 0.908831000328064, "learning_rate": 0.0002, "epoch": 4.228571428571429, "step": 1850}, {"loss": 1.1653, "grad_norm": 1.0223685503005981, "learning_rate": 0.0002, "epoch": 4.251428571428572, "step": 1860}, {"loss": 1.2497, "grad_norm": 0.9771921634674072, "learning_rate": 0.0002, "epoch": 4.274285714285714, "step": 1870}, {"loss": 1.213, "grad_norm": 0.9313384890556335, "learning_rate": 0.0002, "epoch": 4.297142857142857, "step": 1880}, {"loss": 1.1723, "grad_norm": 1.0754257440567017, "learning_rate": 0.0002, "epoch": 4.32, "step": 1890}, {"loss": 1.2286, "grad_norm": 0.8904672265052795, "learning_rate": 0.0002, "epoch": 4.3428571428571425, "step": 1900}, {"loss": 1.2618, "grad_norm": 1.046527624130249, "learning_rate": 0.0002, "epoch": 4.365714285714286, "step": 1910}, {"loss": 1.2368, "grad_norm": 0.9576982855796814, "learning_rate": 0.0002, "epoch": 4.388571428571429, "step": 1920}, {"loss": 1.211, "grad_norm": 0.9278356432914734, "learning_rate": 0.0002, "epoch": 4.411428571428571, "step": 1930}, {"loss": 1.2005, "grad_norm": 1.1763030290603638, "learning_rate": 0.0002, "epoch": 4.434285714285714, "step": 1940}, {"loss": 1.1541, "grad_norm": 0.9183000326156616, "learning_rate": 0.0002, "epoch": 4.457142857142857, "step": 1950}, {"loss": 1.2257, "grad_norm": 1.050980806350708, "learning_rate": 0.0002, "epoch": 4.48, "step": 1960}, {"loss": 1.2133, "grad_norm": 0.9975392818450928, "learning_rate": 0.0002, "epoch": 4.502857142857143, "step": 1970}, {"loss": 1.2312, "grad_norm": 0.990544319152832, "learning_rate": 0.0002, "epoch": 4.525714285714286, "step": 1980}, {"loss": 1.2465, "grad_norm": 1.004794955253601, "learning_rate": 0.0002, "epoch": 4.548571428571429, "step": 1990}, {"loss": 1.2085, "grad_norm": 0.9294857978820801, "learning_rate": 0.0002, "epoch": 4.571428571428571, "step": 2000}, {"loss": 1.2874, "grad_norm": 0.93436598777771, "learning_rate": 0.0002, "epoch": 4.594285714285714, "step": 2010}, {"loss": 1.1965, "grad_norm": 0.8704655766487122, "learning_rate": 0.0002, "epoch": 4.617142857142857, "step": 2020}, {"loss": 1.204, "grad_norm": 0.9077927470207214, "learning_rate": 0.0002, "epoch": 4.64, "step": 2030}, {"loss": 1.2198, "grad_norm": 0.912987470626831, "learning_rate": 0.0002, "epoch": 4.662857142857143, "step": 2040}, {"loss": 1.2868, "grad_norm": 0.9740643501281738, "learning_rate": 0.0002, "epoch": 4.685714285714286, "step": 2050}, {"loss": 1.249, "grad_norm": 1.133357048034668, "learning_rate": 0.0002, "epoch": 4.708571428571428, "step": 2060}, {"loss": 1.1974, "grad_norm": 0.8844527006149292, "learning_rate": 0.0002, "epoch": 4.731428571428571, "step": 2070}, {"loss": 1.2481, "grad_norm": 1.0083311796188354, "learning_rate": 0.0002, "epoch": 4.7542857142857144, "step": 2080}, {"loss": 1.263, "grad_norm": 1.000447154045105, "learning_rate": 0.0002, "epoch": 4.777142857142858, "step": 2090}, {"loss": 1.2313, "grad_norm": 0.9620300531387329, "learning_rate": 0.0002, "epoch": 4.8, "step": 2100}, {"loss": 1.2659, "grad_norm": 0.9843335151672363, "learning_rate": 0.0002, "epoch": 4.822857142857143, "step": 2110}, {"loss": 1.2535, "grad_norm": 0.9906681180000305, "learning_rate": 0.0002, "epoch": 4.845714285714286, "step": 2120}, {"loss": 1.2325, "grad_norm": 0.9544073939323425, "learning_rate": 0.0002, "epoch": 4.868571428571428, "step": 2130}, {"loss": 1.284, "grad_norm": 0.9392994046211243, "learning_rate": 0.0002, "epoch": 4.8914285714285715, "step": 2140}, {"loss": 1.3075, "grad_norm": 1.104519248008728, "learning_rate": 0.0002, "epoch": 4.914285714285715, "step": 2150}, {"loss": 1.2753, "grad_norm": 0.9495956897735596, "learning_rate": 0.0002, "epoch": 4.937142857142857, "step": 2160}, {"loss": 1.2412, "grad_norm": 0.9696287512779236, "learning_rate": 0.0002, "epoch": 4.96, "step": 2170}, {"loss": 1.2354, "grad_norm": 0.9933681488037109, "learning_rate": 0.0002, "epoch": 4.982857142857143, "step": 2180}, {"eval_loss": 2.099808692932129, "eval_runtime": 111.2808, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.575, "epoch": 4.998857142857143, "step": 2187}, {"loss": 1.2183, "grad_norm": 0.9482853412628174, "learning_rate": 0.0002, "epoch": 5.005714285714285, "step": 2190}, {"loss": 0.9898, "grad_norm": 1.6689555644989014, "learning_rate": 0.0002, "epoch": 5.0285714285714285, "step": 2200}, {"loss": 0.9741, "grad_norm": 1.2019699811935425, "learning_rate": 0.0002, "epoch": 5.051428571428572, "step": 2210}, {"loss": 0.9737, "grad_norm": 1.535780429840088, "learning_rate": 0.0002, "epoch": 5.074285714285715, "step": 2220}, {"loss": 0.9494, "grad_norm": 1.2061309814453125, "learning_rate": 0.0002, "epoch": 5.097142857142857, "step": 2230}, {"loss": 0.9316, "grad_norm": 1.1898778676986694, "learning_rate": 0.0002, "epoch": 5.12, "step": 2240}, {"loss": 1.002, "grad_norm": 1.158898949623108, "learning_rate": 0.0002, "epoch": 5.142857142857143, "step": 2250}, {"loss": 0.9715, "grad_norm": 1.370749592781067, "learning_rate": 0.0002, "epoch": 5.1657142857142855, "step": 2260}, {"loss": 0.9365, "grad_norm": 1.314120888710022, "learning_rate": 0.0002, "epoch": 5.188571428571429, "step": 2270}, {"loss": 1.0316, "grad_norm": 1.2184966802597046, "learning_rate": 0.0002, "epoch": 5.211428571428572, "step": 2280}, {"loss": 0.9407, "grad_norm": 1.4833279848098755, "learning_rate": 0.0002, "epoch": 5.234285714285714, "step": 2290}, {"loss": 0.9635, "grad_norm": 1.3348219394683838, "learning_rate": 0.0002, "epoch": 5.257142857142857, "step": 2300}, {"loss": 1.0294, "grad_norm": 1.4166619777679443, "learning_rate": 0.0002, "epoch": 5.28, "step": 2310}, {"loss": 0.9818, "grad_norm": 1.4539530277252197, "learning_rate": 0.0002, "epoch": 5.3028571428571425, "step": 2320}, {"loss": 1.0165, "grad_norm": 1.4642518758773804, "learning_rate": 0.0002, "epoch": 5.325714285714286, "step": 2330}, {"loss": 1.0081, "grad_norm": 1.3938848972320557, "learning_rate": 0.0002, "epoch": 5.348571428571429, "step": 2340}, {"loss": 1.03, "grad_norm": 1.1147894859313965, "learning_rate": 0.0002, "epoch": 5.371428571428572, "step": 2350}, {"loss": 0.9975, "grad_norm": 1.3465309143066406, "learning_rate": 0.0002, "epoch": 5.394285714285714, "step": 2360}, {"loss": 1.0138, "grad_norm": 1.4788566827774048, "learning_rate": 0.0002, "epoch": 5.417142857142857, "step": 2370}, {"loss": 0.9896, "grad_norm": 1.3808705806732178, "learning_rate": 0.0002, "epoch": 5.44, "step": 2380}, {"loss": 1.0279, "grad_norm": 1.2336329221725464, "learning_rate": 0.0002, "epoch": 5.462857142857143, "step": 2390}, {"loss": 0.9763, "grad_norm": 1.5445678234100342, "learning_rate": 0.0002, "epoch": 5.485714285714286, "step": 2400}, {"loss": 0.9534, "grad_norm": 1.107488989830017, "learning_rate": 0.0002, "epoch": 5.508571428571429, "step": 2410}, {"loss": 1.0036, "grad_norm": 1.39687979221344, "learning_rate": 0.0002, "epoch": 5.531428571428571, "step": 2420}, {"loss": 0.9959, "grad_norm": 1.3905695676803589, "learning_rate": 0.0002, "epoch": 5.554285714285714, "step": 2430}, {"loss": 0.9912, "grad_norm": 1.3772821426391602, "learning_rate": 0.0002, "epoch": 5.577142857142857, "step": 2440}, {"loss": 0.9825, "grad_norm": 1.1661899089813232, "learning_rate": 0.0002, "epoch": 5.6, "step": 2450}, {"loss": 1.0003, "grad_norm": 1.2730463743209839, "learning_rate": 0.0002, "epoch": 5.622857142857143, "step": 2460}, {"loss": 1.0433, "grad_norm": 1.2251193523406982, "learning_rate": 0.0002, "epoch": 5.645714285714286, "step": 2470}, {"loss": 1.079, "grad_norm": 1.5454859733581543, "learning_rate": 0.0002, "epoch": 5.668571428571429, "step": 2480}, {"loss": 1.0414, "grad_norm": 1.5405735969543457, "learning_rate": 0.0002, "epoch": 5.691428571428571, "step": 2490}, {"loss": 1.0353, "grad_norm": 1.2555434703826904, "learning_rate": 0.0002, "epoch": 5.714285714285714, "step": 2500}, {"loss": 1.0019, "grad_norm": 1.3323487043380737, "learning_rate": 0.0002, "epoch": 5.737142857142857, "step": 2510}, {"loss": 1.051, "grad_norm": 1.3106356859207153, "learning_rate": 0.0002, "epoch": 5.76, "step": 2520}, {"loss": 1.0248, "grad_norm": 1.4832439422607422, "learning_rate": 0.0002, "epoch": 5.782857142857143, "step": 2530}, {"loss": 1.0643, "grad_norm": 1.1336562633514404, "learning_rate": 0.0002, "epoch": 5.805714285714286, "step": 2540}, {"loss": 1.0446, "grad_norm": 1.2434223890304565, "learning_rate": 0.0002, "epoch": 5.828571428571428, "step": 2550}, {"loss": 1.0467, "grad_norm": 1.2825450897216797, "learning_rate": 0.0002, "epoch": 5.851428571428571, "step": 2560}, {"loss": 1.0642, "grad_norm": 1.4373180866241455, "learning_rate": 0.0002, "epoch": 5.8742857142857146, "step": 2570}, {"loss": 1.0814, "grad_norm": 1.435015320777893, "learning_rate": 0.0002, "epoch": 5.897142857142857, "step": 2580}, {"loss": 1.0272, "grad_norm": 1.4075653553009033, "learning_rate": 0.0002, "epoch": 5.92, "step": 2590}, {"loss": 1.0703, "grad_norm": 1.319630742073059, "learning_rate": 0.0002, "epoch": 5.942857142857143, "step": 2600}, {"loss": 1.0375, "grad_norm": 1.278330683708191, "learning_rate": 0.0002, "epoch": 5.965714285714286, "step": 2610}, {"loss": 1.0766, "grad_norm": 1.258158564567566, "learning_rate": 0.0002, "epoch": 5.988571428571428, "step": 2620}, {"eval_loss": 2.3689301013946533, "eval_runtime": 53.9067, "eval_samples_per_second": 9.405, "eval_steps_per_second": 1.187, "epoch": 6.0, "step": 2625}, {"loss": 0.9142, "grad_norm": 1.3128368854522705, "learning_rate": 0.0002, "epoch": 6.011428571428572, "step": 2630}, {"loss": 0.7716, "grad_norm": 1.4280474185943604, "learning_rate": 0.0002, "epoch": 6.034285714285715, "step": 2640}, {"loss": 0.7776, "grad_norm": 1.5061450004577637, "learning_rate": 0.0002, "epoch": 6.057142857142857, "step": 2650}, {"loss": 0.7707, "grad_norm": 1.6013342142105103, "learning_rate": 0.0002, "epoch": 6.08, "step": 2660}, {"loss": 0.7543, "grad_norm": 2.0107381343841553, "learning_rate": 0.0002, "epoch": 6.102857142857143, "step": 2670}, {"loss": 0.747, "grad_norm": 1.5010124444961548, "learning_rate": 0.0002, "epoch": 6.1257142857142854, "step": 2680}, {"loss": 0.7501, "grad_norm": 1.5222150087356567, "learning_rate": 0.0002, "epoch": 6.148571428571429, "step": 2690}, {"loss": 0.7712, "grad_norm": 1.5413103103637695, "learning_rate": 0.0002, "epoch": 6.171428571428572, "step": 2700}, {"loss": 0.7, "grad_norm": 1.527140736579895, "learning_rate": 0.0002, "epoch": 6.194285714285714, "step": 2710}, {"loss": 0.7539, "grad_norm": 1.9386590719223022, "learning_rate": 0.0002, "epoch": 6.217142857142857, "step": 2720}, {"loss": 0.7586, "grad_norm": 1.8115214109420776, "learning_rate": 0.0002, "epoch": 6.24, "step": 2730}, {"loss": 0.7426, "grad_norm": 1.6221802234649658, "learning_rate": 0.0002, "epoch": 6.2628571428571425, "step": 2740}, {"loss": 0.8002, "grad_norm": 1.6698768138885498, "learning_rate": 0.0002, "epoch": 6.285714285714286, "step": 2750}, {"loss": 0.7293, "grad_norm": 1.7960610389709473, "learning_rate": 0.0002, "epoch": 6.308571428571429, "step": 2760}, {"loss": 0.7405, "grad_norm": 1.32172429561615, "learning_rate": 0.0002, "epoch": 6.331428571428571, "step": 2770}, {"loss": 0.7198, "grad_norm": 1.7468090057373047, "learning_rate": 0.0002, "epoch": 6.354285714285714, "step": 2780}, {"loss": 0.76, "grad_norm": 1.6777397394180298, "learning_rate": 0.0002, "epoch": 6.377142857142857, "step": 2790}, {"loss": 0.7879, "grad_norm": 1.6200671195983887, "learning_rate": 0.0002, "epoch": 6.4, "step": 2800}, {"loss": 0.7807, "grad_norm": 1.723505973815918, "learning_rate": 0.0002, "epoch": 6.422857142857143, "step": 2810}, {"loss": 0.8645, "grad_norm": 1.4945589303970337, "learning_rate": 0.0002, "epoch": 6.445714285714286, "step": 2820}, {"loss": 0.809, "grad_norm": 1.666458010673523, "learning_rate": 0.0002, "epoch": 6.468571428571429, "step": 2830}, {"loss": 0.7996, "grad_norm": 1.6586525440216064, "learning_rate": 0.0002, "epoch": 6.491428571428571, "step": 2840}, {"loss": 0.8062, "grad_norm": 1.7480043172836304, "learning_rate": 0.0002, "epoch": 6.514285714285714, "step": 2850}, {"loss": 0.7602, "grad_norm": 1.4605649709701538, "learning_rate": 0.0002, "epoch": 6.537142857142857, "step": 2860}, {"loss": 0.8186, "grad_norm": 1.4841814041137695, "learning_rate": 0.0002, "epoch": 6.5600000000000005, "step": 2870}, {"loss": 0.8156, "grad_norm": 1.4653114080429077, "learning_rate": 0.0002, "epoch": 6.582857142857143, "step": 2880}, {"loss": 0.8111, "grad_norm": 1.7266837358474731, "learning_rate": 0.0002, "epoch": 6.605714285714286, "step": 2890}, {"loss": 0.7644, "grad_norm": 1.4860098361968994, "learning_rate": 0.0002, "epoch": 6.628571428571428, "step": 2900}, {"loss": 0.7991, "grad_norm": 1.7177597284317017, "learning_rate": 0.0002, "epoch": 6.651428571428571, "step": 2910}, {"loss": 0.7883, "grad_norm": 1.6757104396820068, "learning_rate": 0.0002, "epoch": 6.674285714285714, "step": 2920}, {"loss": 0.8598, "grad_norm": 1.5177433490753174, "learning_rate": 0.0002, "epoch": 6.6971428571428575, "step": 2930}, {"loss": 0.7825, "grad_norm": 1.8073889017105103, "learning_rate": 0.0002, "epoch": 6.72, "step": 2940}, {"loss": 0.8234, "grad_norm": 1.72337007522583, "learning_rate": 0.0002, "epoch": 6.742857142857143, "step": 2950}, {"loss": 0.896, "grad_norm": 1.6298240423202515, "learning_rate": 0.0002, "epoch": 6.765714285714286, "step": 2960}, {"loss": 0.8252, "grad_norm": 1.6140344142913818, "learning_rate": 0.0002, "epoch": 6.788571428571428, "step": 2970}, {"loss": 0.8314, "grad_norm": 1.7180862426757812, "learning_rate": 0.0002, "epoch": 6.811428571428571, "step": 2980}, {"loss": 0.7929, "grad_norm": 1.7589894533157349, "learning_rate": 0.0002, "epoch": 6.8342857142857145, "step": 2990}, {"loss": 0.828, "grad_norm": 1.780195713043213, "learning_rate": 0.0002, "epoch": 6.857142857142857, "step": 3000}, {"loss": 0.8943, "grad_norm": 1.7182508707046509, "learning_rate": 0.0002, "epoch": 6.88, "step": 3010}, {"loss": 0.7964, "grad_norm": 1.6308406591415405, "learning_rate": 0.0002, "epoch": 6.902857142857143, "step": 3020}, {"loss": 0.8207, "grad_norm": 1.5080229043960571, "learning_rate": 0.0002, "epoch": 6.925714285714285, "step": 3030}, {"loss": 0.886, "grad_norm": 1.623555064201355, "learning_rate": 0.0002, "epoch": 6.948571428571428, "step": 3040}, {"loss": 0.8377, "grad_norm": 1.526054859161377, "learning_rate": 0.0002, "epoch": 6.9714285714285715, "step": 3050}, {"loss": 0.8816, "grad_norm": 1.6671174764633179, "learning_rate": 0.0002, "epoch": 6.994285714285715, "step": 3060}, {"eval_loss": 2.647613525390625, "eval_runtime": 111.2255, "eval_samples_per_second": 4.558, "eval_steps_per_second": 0.575, "epoch": 6.998857142857143, "step": 3062}, {"loss": 0.6303, "grad_norm": 1.9154540300369263, "learning_rate": 0.0002, "epoch": 7.017142857142857, "step": 3070}, {"loss": 0.5254, "grad_norm": 2.1938717365264893, "learning_rate": 0.0002, "epoch": 7.04, "step": 3080}, {"loss": 0.5087, "grad_norm": 1.7861053943634033, "learning_rate": 0.0002, "epoch": 7.062857142857143, "step": 3090}, {"loss": 0.5751, "grad_norm": 2.096458911895752, "learning_rate": 0.0002, "epoch": 7.085714285714285, "step": 3100}, {"loss": 0.5188, "grad_norm": 2.0057616233825684, "learning_rate": 0.0002, "epoch": 7.1085714285714285, "step": 3110}, {"loss": 0.5544, "grad_norm": 1.7073354721069336, "learning_rate": 0.0002, "epoch": 7.131428571428572, "step": 3120}, {"loss": 0.6005, "grad_norm": 2.3477938175201416, "learning_rate": 0.0002, "epoch": 7.154285714285714, "step": 3130}, {"loss": 0.5303, "grad_norm": 2.0903899669647217, "learning_rate": 0.0002, "epoch": 7.177142857142857, "step": 3140}, {"loss": 0.5397, "grad_norm": 1.7363157272338867, "learning_rate": 0.0002, "epoch": 7.2, "step": 3150}, {"loss": 0.5491, "grad_norm": 2.0611023902893066, "learning_rate": 0.0002, "epoch": 7.222857142857142, "step": 3160}, {"loss": 0.5865, "grad_norm": 2.404407501220703, "learning_rate": 0.0002, "epoch": 7.2457142857142856, "step": 3170}, {"loss": 0.5743, "grad_norm": 2.1841039657592773, "learning_rate": 0.0002, "epoch": 7.268571428571429, "step": 3180}, {"loss": 0.6604, "grad_norm": 1.7582741975784302, "learning_rate": 0.0002, "epoch": 7.291428571428572, "step": 3190}, {"loss": 0.5697, "grad_norm": 1.8890602588653564, "learning_rate": 0.0002, "epoch": 7.314285714285714, "step": 3200}, {"loss": 0.5969, "grad_norm": 1.8433198928833008, "learning_rate": 0.0002, "epoch": 7.337142857142857, "step": 3210}, {"loss": 0.6019, "grad_norm": 1.652266263961792, "learning_rate": 0.0002, "epoch": 7.36, "step": 3220}, {"loss": 0.5439, "grad_norm": 1.914348840713501, "learning_rate": 0.0002, "epoch": 7.382857142857143, "step": 3230}, {"loss": 0.6198, "grad_norm": 1.7440582513809204, "learning_rate": 0.0002, "epoch": 7.405714285714286, "step": 3240}, {"loss": 0.6022, "grad_norm": 1.9745666980743408, "learning_rate": 0.0002, "epoch": 7.428571428571429, "step": 3250}, {"loss": 0.5512, "grad_norm": 1.6567715406417847, "learning_rate": 0.0002, "epoch": 7.451428571428571, "step": 3260}, {"loss": 0.5985, "grad_norm": 1.5239425897598267, "learning_rate": 0.0002, "epoch": 7.474285714285714, "step": 3270}, {"loss": 0.5944, "grad_norm": 2.0668740272521973, "learning_rate": 0.0002, "epoch": 7.497142857142857, "step": 3280}, {"loss": 0.5424, "grad_norm": 1.9551687240600586, "learning_rate": 0.0002, "epoch": 7.52, "step": 3290}, {"loss": 0.6521, "grad_norm": 2.276602268218994, "learning_rate": 0.0002, "epoch": 7.542857142857143, "step": 3300}, {"loss": 0.6183, "grad_norm": 1.9060227870941162, "learning_rate": 0.0002, "epoch": 7.565714285714286, "step": 3310}, {"loss": 0.6151, "grad_norm": 2.0276358127593994, "learning_rate": 0.0002, "epoch": 7.588571428571429, "step": 3320}, {"loss": 0.5731, "grad_norm": 2.037238121032715, "learning_rate": 0.0002, "epoch": 7.611428571428571, "step": 3330}, {"loss": 0.6522, "grad_norm": 2.0060055255889893, "learning_rate": 0.0002, "epoch": 7.634285714285714, "step": 3340}, {"loss": 0.6275, "grad_norm": 1.8366512060165405, "learning_rate": 0.0002, "epoch": 7.6571428571428575, "step": 3350}, {"loss": 0.6535, "grad_norm": 2.0789284706115723, "learning_rate": 0.0002, "epoch": 7.68, "step": 3360}, {"loss": 0.6197, "grad_norm": 2.137089490890503, "learning_rate": 0.0002, "epoch": 7.702857142857143, "step": 3370}, {"loss": 0.6267, "grad_norm": 1.829277753829956, "learning_rate": 0.0002, "epoch": 7.725714285714286, "step": 3380}, {"loss": 0.6567, "grad_norm": 1.9483778476715088, "learning_rate": 0.0002, "epoch": 7.748571428571428, "step": 3390}, {"loss": 0.6393, "grad_norm": 2.0347065925598145, "learning_rate": 0.0002, "epoch": 7.771428571428571, "step": 3400}, {"loss": 0.662, "grad_norm": 2.0142312049865723, "learning_rate": 0.0002, "epoch": 7.7942857142857145, "step": 3410}, {"loss": 0.6349, "grad_norm": 2.152569055557251, "learning_rate": 0.0002, "epoch": 7.817142857142857, "step": 3420}, {"loss": 0.6805, "grad_norm": 1.7300190925598145, "learning_rate": 0.0002, "epoch": 7.84, "step": 3430}, {"loss": 0.6538, "grad_norm": 2.3944954872131348, "learning_rate": 0.0002, "epoch": 7.862857142857143, "step": 3440}, {"loss": 0.6187, "grad_norm": 2.1004269123077393, "learning_rate": 0.0002, "epoch": 7.885714285714286, "step": 3450}, {"loss": 0.6626, "grad_norm": 2.05513072013855, "learning_rate": 0.0002, "epoch": 7.908571428571428, "step": 3460}, {"loss": 0.637, "grad_norm": 1.9822633266448975, "learning_rate": 0.0002, "epoch": 7.9314285714285715, "step": 3470}, {"loss": 0.6663, "grad_norm": 1.9649063348770142, "learning_rate": 0.0002, "epoch": 7.954285714285715, "step": 3480}, {"loss": 0.6192, "grad_norm": 1.7002657651901245, "learning_rate": 0.0002, "epoch": 7.977142857142857, "step": 3490}]}