diff --git a/.gitattributes b/.gitattributes index 5ecdab0915a1d5a508ce4edd5c306de850e1ac1f..e7fddcd5fc197e678828bc6b231f93bf39d3c900 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1534,3 +1534,12 @@ gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a- gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-617-sd-4/checkpoint-496/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-617-sd-4/checkpoint-62/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-617-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..880901fe41029fff54981e67b19f9562a7d4ebdd --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f63771989885dd3030480c888f982b482a4f0b6609b4f6205f6bf9b1faaccaf +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..880901fe41029fff54981e67b19f9562a7d4ebdd --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f63771989885dd3030480c888f982b482a4f0b6609b4f6205f6bf9b1faaccaf +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..deef6ae3165558586e01373aa39ec4c0c4b773b5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a14bd7bd809dcf34e3681aa306e3332e2ce2c7c355217ad047c7a5749bf95e6 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e1b09a5dbcfe048eb44a413857b39f43d94d5e0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4eb9f332793c665c686d8296af92d4a1ded65117b574b2bd8d775e5ba49b28 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..08608f46a39eb77cc111483d0da7109bea8c5c76 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:660ab54d0c298f22afda4a2655936aecd813daf93f4fc040b7d285b3684be4c0 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a87f6d30d6fd3f89708ffc32c0c21f542f80d156 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/trainer_state.json @@ -0,0 +1,755 @@ +{ + "best_metric": 1.8103164434432983, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", + "epoch": 0.9995129079396006, + "eval_steps": 10, + "global_step": 1026, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00974184120798831, + "grad_norm": 0.6537588834762573, + "learning_rate": 0.0002, + "loss": 3.0782, + "step": 10 + }, + { + "epoch": 0.01948368241597662, + "grad_norm": 0.5270306468009949, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 20 + }, + { + "epoch": 0.029225523623964928, + "grad_norm": 0.6826501488685608, + "learning_rate": 0.0002, + "loss": 2.2589, + "step": 30 + }, + { + "epoch": 0.03896736483195324, + "grad_norm": 0.5061377286911011, + "learning_rate": 0.0002, + "loss": 2.0141, + "step": 40 + }, + { + "epoch": 0.04870920603994155, + "grad_norm": 0.4300410747528076, + "learning_rate": 0.0002, + "loss": 1.9458, + "step": 50 + }, + { + "epoch": 0.058451047247929856, + "grad_norm": 0.5063319802284241, + "learning_rate": 0.0002, + "loss": 1.983, + "step": 60 + }, + { + "epoch": 0.06819288845591817, + "grad_norm": 0.49310117959976196, + "learning_rate": 0.0002, + "loss": 1.9799, + "step": 70 + }, + { + "epoch": 0.07793472966390648, + "grad_norm": 0.4676004648208618, + "learning_rate": 0.0002, + "loss": 1.9277, + "step": 80 + }, + { + "epoch": 0.08767657087189479, + "grad_norm": 0.41647228598594666, + "learning_rate": 0.0002, + "loss": 1.9147, + "step": 90 + }, + { + "epoch": 0.0974184120798831, + "grad_norm": 0.40217313170433044, + "learning_rate": 0.0002, + "loss": 1.8894, + "step": 100 + }, + { + "epoch": 0.1071602532878714, + "grad_norm": 0.4123637080192566, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 110 + }, + { + "epoch": 0.11690209449585971, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 120 + }, + { + "epoch": 0.12664393570384802, + "grad_norm": 0.3179326355457306, + "learning_rate": 0.0002, + "loss": 1.894, + "step": 130 + }, + { + "epoch": 0.13638577691183634, + "grad_norm": 0.3548192083835602, + "learning_rate": 0.0002, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.14612761811982464, + "grad_norm": 0.3273540139198303, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 150 + }, + { + "epoch": 0.15586945932781296, + "grad_norm": 0.36500975489616394, + "learning_rate": 0.0002, + "loss": 1.8496, + "step": 160 + }, + { + "epoch": 0.16561130053580125, + "grad_norm": 0.4106619656085968, + "learning_rate": 0.0002, + "loss": 1.8473, + "step": 170 + }, + { + "epoch": 0.17535314174378958, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0002, + "loss": 1.841, + "step": 180 + }, + { + "epoch": 0.1850949829517779, + "grad_norm": 0.3608580231666565, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 190 + }, + { + "epoch": 0.1948368241597662, + "grad_norm": 0.4291760325431824, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 200 + }, + { + "epoch": 0.20457866536775451, + "grad_norm": 0.344184011220932, + "learning_rate": 0.0002, + "loss": 1.8437, + "step": 210 + }, + { + "epoch": 0.2143205065757428, + "grad_norm": 0.3834705650806427, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 220 + }, + { + "epoch": 0.22406234778373113, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 230 + }, + { + "epoch": 0.23380418899171942, + "grad_norm": 0.4306780695915222, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 240 + }, + { + "epoch": 0.24354603019970775, + "grad_norm": 0.5066465139389038, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 250 + }, + { + "epoch": 0.25328787140769604, + "grad_norm": 0.34227681159973145, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 260 + }, + { + "epoch": 0.26302971261568436, + "grad_norm": 0.3346865475177765, + "learning_rate": 0.0002, + "loss": 1.8614, + "step": 270 + }, + { + "epoch": 0.2727715538236727, + "grad_norm": 0.3639362156391144, + "learning_rate": 0.0002, + "loss": 1.8502, + "step": 280 + }, + { + "epoch": 0.282513395031661, + "grad_norm": 0.33223700523376465, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 290 + }, + { + "epoch": 0.2922552362396493, + "grad_norm": 0.35176315903663635, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 300 + }, + { + "epoch": 0.3019970774476376, + "grad_norm": 0.3581472635269165, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 310 + }, + { + "epoch": 0.3117389186556259, + "grad_norm": 0.35943421721458435, + "learning_rate": 0.0002, + "loss": 1.8262, + "step": 320 + }, + { + "epoch": 0.32148075986361424, + "grad_norm": 0.322051078081131, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 330 + }, + { + "epoch": 0.3312226010716025, + "grad_norm": 0.33904823660850525, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 340 + }, + { + "epoch": 0.34096444227959083, + "grad_norm": 0.39162731170654297, + "learning_rate": 0.0002, + "loss": 1.9159, + "step": 350 + }, + { + "epoch": 0.35070628348757915, + "grad_norm": 0.330624520778656, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 360 + }, + { + "epoch": 0.3604481246955675, + "grad_norm": 0.3793248236179352, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 370 + }, + { + "epoch": 0.3701899659035558, + "grad_norm": 0.3347395658493042, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 380 + }, + { + "epoch": 0.37993180711154406, + "grad_norm": 0.30527254939079285, + "learning_rate": 0.0002, + "loss": 1.9244, + "step": 390 + }, + { + "epoch": 0.3896736483195324, + "grad_norm": 0.3081390857696533, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 400 + }, + { + "epoch": 0.3994154895275207, + "grad_norm": 0.3742620050907135, + "learning_rate": 0.0002, + "loss": 1.8968, + "step": 410 + }, + { + "epoch": 0.40915733073550903, + "grad_norm": 0.4080568253993988, + "learning_rate": 0.0002, + "loss": 1.8095, + "step": 420 + }, + { + "epoch": 0.4188991719434973, + "grad_norm": 0.38034746050834656, + "learning_rate": 0.0002, + "loss": 1.8555, + "step": 430 + }, + { + "epoch": 0.4286410131514856, + "grad_norm": 0.34893402457237244, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 440 + }, + { + "epoch": 0.43838285435947394, + "grad_norm": 0.33285608887672424, + "learning_rate": 0.0002, + "loss": 1.8481, + "step": 450 + }, + { + "epoch": 0.44812469556746226, + "grad_norm": 0.4110095798969269, + "learning_rate": 0.0002, + "loss": 1.8466, + "step": 460 + }, + { + "epoch": 0.4578665367754506, + "grad_norm": 0.3658817410469055, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 470 + }, + { + "epoch": 0.46760837798343885, + "grad_norm": 0.31350770592689514, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 480 + }, + { + "epoch": 0.47735021919142717, + "grad_norm": 0.38827991485595703, + "learning_rate": 0.0002, + "loss": 1.7839, + "step": 490 + }, + { + "epoch": 0.4870920603994155, + "grad_norm": 0.3792393207550049, + "learning_rate": 0.0002, + "loss": 1.8224, + "step": 500 + }, + { + "epoch": 0.4968339016074038, + "grad_norm": 0.3004095256328583, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 510 + }, + { + "epoch": 0.5065757428153921, + "grad_norm": 0.3200063407421112, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 520 + }, + { + "epoch": 0.5163175840233805, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 530 + }, + { + "epoch": 0.5260594252313687, + "grad_norm": 0.30258631706237793, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 540 + }, + { + "epoch": 0.535801266439357, + "grad_norm": 0.28210392594337463, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 550 + }, + { + "epoch": 0.5455431076473454, + "grad_norm": 0.34854066371917725, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 560 + }, + { + "epoch": 0.5552849488553336, + "grad_norm": 0.31689873337745667, + "learning_rate": 0.0002, + "loss": 1.8331, + "step": 570 + }, + { + "epoch": 0.565026790063322, + "grad_norm": 0.31253790855407715, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 580 + }, + { + "epoch": 0.5747686312713103, + "grad_norm": 0.3229721188545227, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 590 + }, + { + "epoch": 0.5845104724792985, + "grad_norm": 0.3723772466182709, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 600 + }, + { + "epoch": 0.5942523136872869, + "grad_norm": 0.345798522233963, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 610 + }, + { + "epoch": 0.6039941548952752, + "grad_norm": 0.3440598249435425, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 620 + }, + { + "epoch": 0.6137359961032636, + "grad_norm": 0.3406416177749634, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 630 + }, + { + "epoch": 0.6234778373112518, + "grad_norm": 0.3218357264995575, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 640 + }, + { + "epoch": 0.6332196785192401, + "grad_norm": 0.45319172739982605, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 650 + }, + { + "epoch": 0.6429615197272285, + "grad_norm": 0.2787110507488251, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 660 + }, + { + "epoch": 0.6527033609352167, + "grad_norm": 0.3064707815647125, + "learning_rate": 0.0002, + "loss": 1.8426, + "step": 670 + }, + { + "epoch": 0.662445202143205, + "grad_norm": 0.2940629720687866, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 680 + }, + { + "epoch": 0.6721870433511934, + "grad_norm": 0.31695225834846497, + "learning_rate": 0.0002, + "loss": 1.7865, + "step": 690 + }, + { + "epoch": 0.6819288845591817, + "grad_norm": 0.29589611291885376, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 700 + }, + { + "epoch": 0.69167072576717, + "grad_norm": 0.3062121570110321, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 710 + }, + { + "epoch": 0.7014125669751583, + "grad_norm": 0.3315656781196594, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 720 + }, + { + "epoch": 0.7111544081831466, + "grad_norm": 0.30353930592536926, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 730 + }, + { + "epoch": 0.720896249391135, + "grad_norm": 0.28360483050346375, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 0.7306380905991232, + "grad_norm": 0.3362562656402588, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 750 + }, + { + "epoch": 0.7403799318071116, + "grad_norm": 0.40434667468070984, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 760 + }, + { + "epoch": 0.7501217730150999, + "grad_norm": 0.2930425703525543, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 770 + }, + { + "epoch": 0.7598636142230881, + "grad_norm": 0.30177003145217896, + "learning_rate": 0.0002, + "loss": 1.8216, + "step": 780 + }, + { + "epoch": 0.7696054554310765, + "grad_norm": 0.2784474790096283, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 790 + }, + { + "epoch": 0.7793472966390648, + "grad_norm": 0.35849854350090027, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 800 + }, + { + "epoch": 0.7890891378470531, + "grad_norm": 0.27329114079475403, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 810 + }, + { + "epoch": 0.7988309790550414, + "grad_norm": 0.33331671357154846, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 820 + }, + { + "epoch": 0.8085728202630297, + "grad_norm": 0.28727295994758606, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 830 + }, + { + "epoch": 0.8183146614710181, + "grad_norm": 0.31391268968582153, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 840 + }, + { + "epoch": 0.8280565026790063, + "grad_norm": 0.3303709030151367, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 850 + }, + { + "epoch": 0.8377983438869946, + "grad_norm": 0.33772537112236023, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 860 + }, + { + "epoch": 0.847540185094983, + "grad_norm": 0.32876333594322205, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 870 + }, + { + "epoch": 0.8572820263029712, + "grad_norm": 0.28444716334342957, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 880 + }, + { + "epoch": 0.8670238675109596, + "grad_norm": 0.3070019483566284, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 890 + }, + { + "epoch": 0.8767657087189479, + "grad_norm": 0.29484760761260986, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 900 + }, + { + "epoch": 0.8865075499269361, + "grad_norm": 0.32373034954071045, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 910 + }, + { + "epoch": 0.8962493911349245, + "grad_norm": 0.3229396939277649, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 920 + }, + { + "epoch": 0.9059912323429128, + "grad_norm": 0.33151453733444214, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 930 + }, + { + "epoch": 0.9157330735509012, + "grad_norm": 0.32037460803985596, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 940 + }, + { + "epoch": 0.9254749147588894, + "grad_norm": 0.31283533573150635, + "learning_rate": 0.0002, + "loss": 1.822, + "step": 950 + }, + { + "epoch": 0.9352167559668777, + "grad_norm": 0.27984118461608887, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 960 + }, + { + "epoch": 0.9449585971748661, + "grad_norm": 0.316500186920166, + "learning_rate": 0.0002, + "loss": 1.7755, + "step": 970 + }, + { + "epoch": 0.9547004383828543, + "grad_norm": 0.33708682656288147, + "learning_rate": 0.0002, + "loss": 1.8032, + "step": 980 + }, + { + "epoch": 0.9644422795908427, + "grad_norm": 0.31026017665863037, + "learning_rate": 0.0002, + "loss": 1.8863, + "step": 990 + }, + { + "epoch": 0.974184120798831, + "grad_norm": 0.30874672532081604, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 1000 + }, + { + "epoch": 0.9839259620068193, + "grad_norm": 0.3257741630077362, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1010 + }, + { + "epoch": 0.9936678032148076, + "grad_norm": 0.2865653932094574, + "learning_rate": 0.0002, + "loss": 1.7936, + "step": 1020 + }, + { + "epoch": 0.9995129079396006, + "eval_loss": 1.8103164434432983, + "eval_runtime": 56.3917, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 1.135, + "step": 1026 + } + ], + "logging_steps": 10, + "max_steps": 8208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.272488488730624e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..12ef772eb1159ae731240b0f1e1856ca61f42cea --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a48a7bbe9dd3ba0a69f9915246bc8c09d59eb48fa97982bb17ecae6a0e4574 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca3b9296d0df17d668eb413821dfcd30d49fbb9e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3561b40d2f24fb85b054ab9b43e627a22934108af1a380436def56674ca11441 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4921bc38e6acd53255e0202b75480b41356aaf87 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceaa5adcb527c2f297d40f7bfac956b3d7389765453573b67579e3b48bff060c +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c3b9bed50751c13ee5b8e48c5fc22e572d60c34 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ebded2a9aa7f2379ce88e70901f21f9672c01c5b0e90fa087e6a51f5eb1d78 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d4dcc27d0929dcf8f5c9668d87e2099c83dc75c6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/trainer_state.json @@ -0,0 +1,1484 @@ +{ + "best_metric": 1.8103164434432983, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2053, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00974184120798831, + "grad_norm": 0.6537588834762573, + "learning_rate": 0.0002, + "loss": 3.0782, + "step": 10 + }, + { + "epoch": 0.01948368241597662, + "grad_norm": 0.5270306468009949, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 20 + }, + { + "epoch": 0.029225523623964928, + "grad_norm": 0.6826501488685608, + "learning_rate": 0.0002, + "loss": 2.2589, + "step": 30 + }, + { + "epoch": 0.03896736483195324, + "grad_norm": 0.5061377286911011, + "learning_rate": 0.0002, + "loss": 2.0141, + "step": 40 + }, + { + "epoch": 0.04870920603994155, + "grad_norm": 0.4300410747528076, + "learning_rate": 0.0002, + "loss": 1.9458, + "step": 50 + }, + { + "epoch": 0.058451047247929856, + "grad_norm": 0.5063319802284241, + "learning_rate": 0.0002, + "loss": 1.983, + "step": 60 + }, + { + "epoch": 0.06819288845591817, + "grad_norm": 0.49310117959976196, + "learning_rate": 0.0002, + "loss": 1.9799, + "step": 70 + }, + { + "epoch": 0.07793472966390648, + "grad_norm": 0.4676004648208618, + "learning_rate": 0.0002, + "loss": 1.9277, + "step": 80 + }, + { + "epoch": 0.08767657087189479, + "grad_norm": 0.41647228598594666, + "learning_rate": 0.0002, + "loss": 1.9147, + "step": 90 + }, + { + "epoch": 0.0974184120798831, + "grad_norm": 0.40217313170433044, + "learning_rate": 0.0002, + "loss": 1.8894, + "step": 100 + }, + { + "epoch": 0.1071602532878714, + "grad_norm": 0.4123637080192566, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 110 + }, + { + "epoch": 0.11690209449585971, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 120 + }, + { + "epoch": 0.12664393570384802, + "grad_norm": 0.3179326355457306, + "learning_rate": 0.0002, + "loss": 1.894, + "step": 130 + }, + { + "epoch": 0.13638577691183634, + "grad_norm": 0.3548192083835602, + "learning_rate": 0.0002, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.14612761811982464, + "grad_norm": 0.3273540139198303, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 150 + }, + { + "epoch": 0.15586945932781296, + "grad_norm": 0.36500975489616394, + "learning_rate": 0.0002, + "loss": 1.8496, + "step": 160 + }, + { + "epoch": 0.16561130053580125, + "grad_norm": 0.4106619656085968, + "learning_rate": 0.0002, + "loss": 1.8473, + "step": 170 + }, + { + "epoch": 0.17535314174378958, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0002, + "loss": 1.841, + "step": 180 + }, + { + "epoch": 0.1850949829517779, + "grad_norm": 0.3608580231666565, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 190 + }, + { + "epoch": 0.1948368241597662, + "grad_norm": 0.4291760325431824, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 200 + }, + { + "epoch": 0.20457866536775451, + "grad_norm": 0.344184011220932, + "learning_rate": 0.0002, + "loss": 1.8437, + "step": 210 + }, + { + "epoch": 0.2143205065757428, + "grad_norm": 0.3834705650806427, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 220 + }, + { + "epoch": 0.22406234778373113, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 230 + }, + { + "epoch": 0.23380418899171942, + "grad_norm": 0.4306780695915222, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 240 + }, + { + "epoch": 0.24354603019970775, + "grad_norm": 0.5066465139389038, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 250 + }, + { + "epoch": 0.25328787140769604, + "grad_norm": 0.34227681159973145, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 260 + }, + { + "epoch": 0.26302971261568436, + "grad_norm": 0.3346865475177765, + "learning_rate": 0.0002, + "loss": 1.8614, + "step": 270 + }, + { + "epoch": 0.2727715538236727, + "grad_norm": 0.3639362156391144, + "learning_rate": 0.0002, + "loss": 1.8502, + "step": 280 + }, + { + "epoch": 0.282513395031661, + "grad_norm": 0.33223700523376465, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 290 + }, + { + "epoch": 0.2922552362396493, + "grad_norm": 0.35176315903663635, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 300 + }, + { + "epoch": 0.3019970774476376, + "grad_norm": 0.3581472635269165, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 310 + }, + { + "epoch": 0.3117389186556259, + "grad_norm": 0.35943421721458435, + "learning_rate": 0.0002, + "loss": 1.8262, + "step": 320 + }, + { + "epoch": 0.32148075986361424, + "grad_norm": 0.322051078081131, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 330 + }, + { + "epoch": 0.3312226010716025, + "grad_norm": 0.33904823660850525, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 340 + }, + { + "epoch": 0.34096444227959083, + "grad_norm": 0.39162731170654297, + "learning_rate": 0.0002, + "loss": 1.9159, + "step": 350 + }, + { + "epoch": 0.35070628348757915, + "grad_norm": 0.330624520778656, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 360 + }, + { + "epoch": 0.3604481246955675, + "grad_norm": 0.3793248236179352, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 370 + }, + { + "epoch": 0.3701899659035558, + "grad_norm": 0.3347395658493042, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 380 + }, + { + "epoch": 0.37993180711154406, + "grad_norm": 0.30527254939079285, + "learning_rate": 0.0002, + "loss": 1.9244, + "step": 390 + }, + { + "epoch": 0.3896736483195324, + "grad_norm": 0.3081390857696533, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 400 + }, + { + "epoch": 0.3994154895275207, + "grad_norm": 0.3742620050907135, + "learning_rate": 0.0002, + "loss": 1.8968, + "step": 410 + }, + { + "epoch": 0.40915733073550903, + "grad_norm": 0.4080568253993988, + "learning_rate": 0.0002, + "loss": 1.8095, + "step": 420 + }, + { + "epoch": 0.4188991719434973, + "grad_norm": 0.38034746050834656, + "learning_rate": 0.0002, + "loss": 1.8555, + "step": 430 + }, + { + "epoch": 0.4286410131514856, + "grad_norm": 0.34893402457237244, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 440 + }, + { + "epoch": 0.43838285435947394, + "grad_norm": 0.33285608887672424, + "learning_rate": 0.0002, + "loss": 1.8481, + "step": 450 + }, + { + "epoch": 0.44812469556746226, + "grad_norm": 0.4110095798969269, + "learning_rate": 0.0002, + "loss": 1.8466, + "step": 460 + }, + { + "epoch": 0.4578665367754506, + "grad_norm": 0.3658817410469055, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 470 + }, + { + "epoch": 0.46760837798343885, + "grad_norm": 0.31350770592689514, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 480 + }, + { + "epoch": 0.47735021919142717, + "grad_norm": 0.38827991485595703, + "learning_rate": 0.0002, + "loss": 1.7839, + "step": 490 + }, + { + "epoch": 0.4870920603994155, + "grad_norm": 0.3792393207550049, + "learning_rate": 0.0002, + "loss": 1.8224, + "step": 500 + }, + { + "epoch": 0.4968339016074038, + "grad_norm": 0.3004095256328583, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 510 + }, + { + "epoch": 0.5065757428153921, + "grad_norm": 0.3200063407421112, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 520 + }, + { + "epoch": 0.5163175840233805, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 530 + }, + { + "epoch": 0.5260594252313687, + "grad_norm": 0.30258631706237793, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 540 + }, + { + "epoch": 0.535801266439357, + "grad_norm": 0.28210392594337463, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 550 + }, + { + "epoch": 0.5455431076473454, + "grad_norm": 0.34854066371917725, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 560 + }, + { + "epoch": 0.5552849488553336, + "grad_norm": 0.31689873337745667, + "learning_rate": 0.0002, + "loss": 1.8331, + "step": 570 + }, + { + "epoch": 0.565026790063322, + "grad_norm": 0.31253790855407715, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 580 + }, + { + "epoch": 0.5747686312713103, + "grad_norm": 0.3229721188545227, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 590 + }, + { + "epoch": 0.5845104724792985, + "grad_norm": 0.3723772466182709, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 600 + }, + { + "epoch": 0.5942523136872869, + "grad_norm": 0.345798522233963, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 610 + }, + { + "epoch": 0.6039941548952752, + "grad_norm": 0.3440598249435425, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 620 + }, + { + "epoch": 0.6137359961032636, + "grad_norm": 0.3406416177749634, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 630 + }, + { + "epoch": 0.6234778373112518, + "grad_norm": 0.3218357264995575, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 640 + }, + { + "epoch": 0.6332196785192401, + "grad_norm": 0.45319172739982605, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 650 + }, + { + "epoch": 0.6429615197272285, + "grad_norm": 0.2787110507488251, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 660 + }, + { + "epoch": 0.6527033609352167, + "grad_norm": 0.3064707815647125, + "learning_rate": 0.0002, + "loss": 1.8426, + "step": 670 + }, + { + "epoch": 0.662445202143205, + "grad_norm": 0.2940629720687866, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 680 + }, + { + "epoch": 0.6721870433511934, + "grad_norm": 0.31695225834846497, + "learning_rate": 0.0002, + "loss": 1.7865, + "step": 690 + }, + { + "epoch": 0.6819288845591817, + "grad_norm": 0.29589611291885376, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 700 + }, + { + "epoch": 0.69167072576717, + "grad_norm": 0.3062121570110321, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 710 + }, + { + "epoch": 0.7014125669751583, + "grad_norm": 0.3315656781196594, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 720 + }, + { + "epoch": 0.7111544081831466, + "grad_norm": 0.30353930592536926, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 730 + }, + { + "epoch": 0.720896249391135, + "grad_norm": 0.28360483050346375, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 0.7306380905991232, + "grad_norm": 0.3362562656402588, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 750 + }, + { + "epoch": 0.7403799318071116, + "grad_norm": 0.40434667468070984, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 760 + }, + { + "epoch": 0.7501217730150999, + "grad_norm": 0.2930425703525543, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 770 + }, + { + "epoch": 0.7598636142230881, + "grad_norm": 0.30177003145217896, + "learning_rate": 0.0002, + "loss": 1.8216, + "step": 780 + }, + { + "epoch": 0.7696054554310765, + "grad_norm": 0.2784474790096283, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 790 + }, + { + "epoch": 0.7793472966390648, + "grad_norm": 0.35849854350090027, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 800 + }, + { + "epoch": 0.7890891378470531, + "grad_norm": 0.27329114079475403, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 810 + }, + { + "epoch": 0.7988309790550414, + "grad_norm": 0.33331671357154846, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 820 + }, + { + "epoch": 0.8085728202630297, + "grad_norm": 0.28727295994758606, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 830 + }, + { + "epoch": 0.8183146614710181, + "grad_norm": 0.31391268968582153, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 840 + }, + { + "epoch": 0.8280565026790063, + "grad_norm": 0.3303709030151367, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 850 + }, + { + "epoch": 0.8377983438869946, + "grad_norm": 0.33772537112236023, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 860 + }, + { + "epoch": 0.847540185094983, + "grad_norm": 0.32876333594322205, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 870 + }, + { + "epoch": 0.8572820263029712, + "grad_norm": 0.28444716334342957, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 880 + }, + { + "epoch": 0.8670238675109596, + "grad_norm": 0.3070019483566284, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 890 + }, + { + "epoch": 0.8767657087189479, + "grad_norm": 0.29484760761260986, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 900 + }, + { + "epoch": 0.8865075499269361, + "grad_norm": 0.32373034954071045, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 910 + }, + { + "epoch": 0.8962493911349245, + "grad_norm": 0.3229396939277649, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 920 + }, + { + "epoch": 0.9059912323429128, + "grad_norm": 0.33151453733444214, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 930 + }, + { + "epoch": 0.9157330735509012, + "grad_norm": 0.32037460803985596, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 940 + }, + { + "epoch": 0.9254749147588894, + "grad_norm": 0.31283533573150635, + "learning_rate": 0.0002, + "loss": 1.822, + "step": 950 + }, + { + "epoch": 0.9352167559668777, + "grad_norm": 0.27984118461608887, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 960 + }, + { + "epoch": 0.9449585971748661, + "grad_norm": 0.316500186920166, + "learning_rate": 0.0002, + "loss": 1.7755, + "step": 970 + }, + { + "epoch": 0.9547004383828543, + "grad_norm": 0.33708682656288147, + "learning_rate": 0.0002, + "loss": 1.8032, + "step": 980 + }, + { + "epoch": 0.9644422795908427, + "grad_norm": 0.31026017665863037, + "learning_rate": 0.0002, + "loss": 1.8863, + "step": 990 + }, + { + "epoch": 0.974184120798831, + "grad_norm": 0.30874672532081604, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 1000 + }, + { + "epoch": 0.9839259620068193, + "grad_norm": 0.3257741630077362, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1010 + }, + { + "epoch": 0.9936678032148076, + "grad_norm": 0.2865653932094574, + "learning_rate": 0.0002, + "loss": 1.7936, + "step": 1020 + }, + { + "epoch": 0.9995129079396006, + "eval_loss": 1.8103164434432983, + "eval_runtime": 56.3917, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 1.135, + "step": 1026 + }, + { + "epoch": 1.003409644422796, + "grad_norm": 0.2860608398914337, + "learning_rate": 0.0002, + "loss": 1.7013, + "step": 1030 + }, + { + "epoch": 1.0131514856307842, + "grad_norm": 0.3156210780143738, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 1040 + }, + { + "epoch": 1.0228933268387725, + "grad_norm": 0.26126575469970703, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 1050 + }, + { + "epoch": 1.032635168046761, + "grad_norm": 0.3019633889198303, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 1060 + }, + { + "epoch": 1.042377009254749, + "grad_norm": 0.534140944480896, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1070 + }, + { + "epoch": 1.0521188504627375, + "grad_norm": 0.311872661113739, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 1080 + }, + { + "epoch": 1.0618606916707258, + "grad_norm": 0.3276001513004303, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 1090 + }, + { + "epoch": 1.071602532878714, + "grad_norm": 0.35227468609809875, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1100 + }, + { + "epoch": 1.0813443740867024, + "grad_norm": 0.3597564995288849, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 1110 + }, + { + "epoch": 1.0910862152946907, + "grad_norm": 0.3547225296497345, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 1120 + }, + { + "epoch": 1.100828056502679, + "grad_norm": 0.3399786353111267, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1130 + }, + { + "epoch": 1.1105698977106673, + "grad_norm": 0.3309086263179779, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 1140 + }, + { + "epoch": 1.1203117389186557, + "grad_norm": 0.39330706000328064, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1150 + }, + { + "epoch": 1.130053580126644, + "grad_norm": 0.3628021776676178, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1160 + }, + { + "epoch": 1.1397954213346322, + "grad_norm": 0.32995012402534485, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 1170 + }, + { + "epoch": 1.1495372625426206, + "grad_norm": 0.36292821168899536, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1180 + }, + { + "epoch": 1.159279103750609, + "grad_norm": 0.3470092713832855, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 1190 + }, + { + "epoch": 1.169020944958597, + "grad_norm": 0.3496156334877014, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1200 + }, + { + "epoch": 1.1787627861665855, + "grad_norm": 0.3442084789276123, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1210 + }, + { + "epoch": 1.1885046273745739, + "grad_norm": 0.34983909130096436, + "learning_rate": 0.0002, + "loss": 1.7763, + "step": 1220 + }, + { + "epoch": 1.198246468582562, + "grad_norm": 0.36505937576293945, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1230 + }, + { + "epoch": 1.2079883097905504, + "grad_norm": 0.31624770164489746, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1240 + }, + { + "epoch": 1.2177301509985388, + "grad_norm": 0.3528020679950714, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1250 + }, + { + "epoch": 1.2274719922065271, + "grad_norm": 0.29294025897979736, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1260 + }, + { + "epoch": 1.2372138334145153, + "grad_norm": 0.35048434138298035, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1270 + }, + { + "epoch": 1.2469556746225037, + "grad_norm": 0.35224461555480957, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1280 + }, + { + "epoch": 1.256697515830492, + "grad_norm": 0.4041554629802704, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1290 + }, + { + "epoch": 1.2664393570384802, + "grad_norm": 0.3447791039943695, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1300 + }, + { + "epoch": 1.2761811982464686, + "grad_norm": 0.3315333425998688, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1310 + }, + { + "epoch": 1.285923039454457, + "grad_norm": 0.3587741255760193, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 1320 + }, + { + "epoch": 1.2956648806624451, + "grad_norm": 0.3704394996166229, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1330 + }, + { + "epoch": 1.3054067218704335, + "grad_norm": 0.38131803274154663, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1340 + }, + { + "epoch": 1.3151485630784219, + "grad_norm": 0.36109617352485657, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 1350 + }, + { + "epoch": 1.32489040428641, + "grad_norm": 0.37283554673194885, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 1360 + }, + { + "epoch": 1.3346322454943984, + "grad_norm": 0.31808891892433167, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1370 + }, + { + "epoch": 1.3443740867023868, + "grad_norm": 0.3370385766029358, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1380 + }, + { + "epoch": 1.354115927910375, + "grad_norm": 0.3568558394908905, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1390 + }, + { + "epoch": 1.3638577691183633, + "grad_norm": 0.3537410497665405, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1400 + }, + { + "epoch": 1.3735996103263517, + "grad_norm": 0.3536544144153595, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 1410 + }, + { + "epoch": 1.38334145153434, + "grad_norm": 0.3772895038127899, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1420 + }, + { + "epoch": 1.3930832927423282, + "grad_norm": 0.38079720735549927, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1430 + }, + { + "epoch": 1.4028251339503166, + "grad_norm": 0.3811109662055969, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1440 + }, + { + "epoch": 1.412566975158305, + "grad_norm": 0.38586318492889404, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 1450 + }, + { + "epoch": 1.4223088163662934, + "grad_norm": 0.3405744135379791, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1460 + }, + { + "epoch": 1.4320506575742815, + "grad_norm": 0.39527642726898193, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1470 + }, + { + "epoch": 1.44179249878227, + "grad_norm": 0.4494728744029999, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 1480 + }, + { + "epoch": 1.4515343399902583, + "grad_norm": 0.34068453311920166, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1490 + }, + { + "epoch": 1.4612761811982464, + "grad_norm": 0.36169710755348206, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 1500 + }, + { + "epoch": 1.4710180224062348, + "grad_norm": 0.31519418954849243, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 1510 + }, + { + "epoch": 1.4807598636142232, + "grad_norm": 0.35117292404174805, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.4905017048222113, + "grad_norm": 0.40951141715049744, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1530 + }, + { + "epoch": 1.5002435460301997, + "grad_norm": 0.37542906403541565, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 1540 + }, + { + "epoch": 1.509985387238188, + "grad_norm": 0.35395753383636475, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1550 + }, + { + "epoch": 1.5197272284461762, + "grad_norm": 0.35497018694877625, + "learning_rate": 0.0002, + "loss": 1.6517, + "step": 1560 + }, + { + "epoch": 1.5294690696541646, + "grad_norm": 0.3693031072616577, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 1570 + }, + { + "epoch": 1.539210910862153, + "grad_norm": 0.34013301134109497, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1580 + }, + { + "epoch": 1.5489527520701412, + "grad_norm": 0.37312784790992737, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1590 + }, + { + "epoch": 1.5586945932781295, + "grad_norm": 0.357496440410614, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1600 + }, + { + "epoch": 1.568436434486118, + "grad_norm": 0.35192370414733887, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1610 + }, + { + "epoch": 1.578178275694106, + "grad_norm": 0.34144821763038635, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 1620 + }, + { + "epoch": 1.5879201169020944, + "grad_norm": 0.3320509195327759, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 1630 + }, + { + "epoch": 1.5976619581100828, + "grad_norm": 0.34178847074508667, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1640 + }, + { + "epoch": 1.607403799318071, + "grad_norm": 0.36567580699920654, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 1650 + }, + { + "epoch": 1.6171456405260596, + "grad_norm": 0.35599812865257263, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1660 + }, + { + "epoch": 1.6268874817340477, + "grad_norm": 0.33765384554862976, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1670 + }, + { + "epoch": 1.636629322942036, + "grad_norm": 0.33142679929733276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1680 + }, + { + "epoch": 1.6463711641500245, + "grad_norm": 0.6959079504013062, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1690 + }, + { + "epoch": 1.6561130053580126, + "grad_norm": 0.35073819756507874, + "learning_rate": 0.0002, + "loss": 1.7665, + "step": 1700 + }, + { + "epoch": 1.665854846566001, + "grad_norm": 0.3461478352546692, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1710 + }, + { + "epoch": 1.6755966877739894, + "grad_norm": 0.3697752058506012, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 1720 + }, + { + "epoch": 1.6853385289819776, + "grad_norm": 0.3755154609680176, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1730 + }, + { + "epoch": 1.695080370189966, + "grad_norm": 0.33977627754211426, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 1740 + }, + { + "epoch": 1.7048222113979543, + "grad_norm": 0.4001041650772095, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1750 + }, + { + "epoch": 1.7145640526059425, + "grad_norm": 0.36998286843299866, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 1760 + }, + { + "epoch": 1.7243058938139308, + "grad_norm": 0.39944565296173096, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1770 + }, + { + "epoch": 1.7340477350219192, + "grad_norm": 0.4002859890460968, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1780 + }, + { + "epoch": 1.7437895762299074, + "grad_norm": 0.33336859941482544, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 1790 + }, + { + "epoch": 1.7535314174378958, + "grad_norm": 0.35853952169418335, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 1800 + }, + { + "epoch": 1.7632732586458841, + "grad_norm": 0.35876700282096863, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1810 + }, + { + "epoch": 1.7730150998538723, + "grad_norm": 0.3497968912124634, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 1820 + }, + { + "epoch": 1.7827569410618607, + "grad_norm": 0.33182016015052795, + "learning_rate": 0.0002, + "loss": 1.7128, + "step": 1830 + }, + { + "epoch": 1.792498782269849, + "grad_norm": 0.33359771966934204, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1840 + }, + { + "epoch": 1.8022406234778372, + "grad_norm": 0.38070961833000183, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 1850 + }, + { + "epoch": 1.8119824646858256, + "grad_norm": 0.34111160039901733, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 1860 + }, + { + "epoch": 1.821724305893814, + "grad_norm": 0.4439302980899811, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1870 + }, + { + "epoch": 1.8314661471018021, + "grad_norm": 0.37065210938453674, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 1880 + }, + { + "epoch": 1.8412079883097907, + "grad_norm": 0.33630406856536865, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 1890 + }, + { + "epoch": 1.8509498295177789, + "grad_norm": 0.334553986787796, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 1900 + }, + { + "epoch": 1.860691670725767, + "grad_norm": 0.3603808879852295, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1910 + }, + { + "epoch": 1.8704335119337556, + "grad_norm": 0.4307343363761902, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1920 + }, + { + "epoch": 1.8801753531417438, + "grad_norm": 0.455602765083313, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1930 + }, + { + "epoch": 1.8899171943497322, + "grad_norm": 0.35242316126823425, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 1940 + }, + { + "epoch": 1.8996590355577205, + "grad_norm": 0.3589116632938385, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1950 + }, + { + "epoch": 1.9094008767657087, + "grad_norm": 0.3540741801261902, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1960 + }, + { + "epoch": 1.919142717973697, + "grad_norm": 0.3547612428665161, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 1970 + }, + { + "epoch": 1.9288845591816854, + "grad_norm": 0.3485773503780365, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1980 + }, + { + "epoch": 1.9386264003896736, + "grad_norm": 0.3560304641723633, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1990 + }, + { + "epoch": 1.948368241597662, + "grad_norm": 0.33299335837364197, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 2000 + }, + { + "epoch": 1.9581100828056504, + "grad_norm": 0.35622233152389526, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2010 + }, + { + "epoch": 1.9678519240136385, + "grad_norm": 0.3681301474571228, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 2020 + }, + { + "epoch": 1.9775937652216269, + "grad_norm": 0.36158084869384766, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2030 + }, + { + "epoch": 1.9873356064296153, + "grad_norm": 0.32560569047927856, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 2040 + }, + { + "epoch": 1.9970774476376034, + "grad_norm": 0.37404149770736694, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 2050 + }, + { + "epoch": 2.0, + "eval_loss": 1.8119343519210815, + "eval_runtime": 96.0045, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 0.667, + "step": 2053 + } + ], + "logging_steps": 10, + "max_steps": 8208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0544976977461248e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-2053/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..65512bc6f3fc67be900be6e35a62a1e7559f14f7 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb5e11916d1dac910d80aeb5e2e29db4a82ddf794e374a22fe3426b5f5037067 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..04141e14e46c60fdcd9cc06b6c7f1bd83814ffd6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c930d03d2d48904855c717f28e9f06f0cb2b65b44e88d53f235435c21b001df +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..487e81911a510e49fb7a86bcbacbf01e4556acc7 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d766ed32a27c4226af4dbeab8fff7fa52d389f2c0263d7c8cc30e1892c1893e +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae86a8a175903eaea7051749a24e8d86d3be8b2d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:024b0220d9692f849dabe772d5f3ffd009410a23eae736dd4e3dfd454a3e41f0 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c7398c82d8e9f11607983783ceaa89c997c9a1ab --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/trainer_state.json @@ -0,0 +1,2206 @@ +{ + "best_metric": 1.8103164434432983, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", + "epoch": 2.9995129079396006, + "eval_steps": 10, + "global_step": 3079, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00974184120798831, + "grad_norm": 0.6537588834762573, + "learning_rate": 0.0002, + "loss": 3.0782, + "step": 10 + }, + { + "epoch": 0.01948368241597662, + "grad_norm": 0.5270306468009949, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 20 + }, + { + "epoch": 0.029225523623964928, + "grad_norm": 0.6826501488685608, + "learning_rate": 0.0002, + "loss": 2.2589, + "step": 30 + }, + { + "epoch": 0.03896736483195324, + "grad_norm": 0.5061377286911011, + "learning_rate": 0.0002, + "loss": 2.0141, + "step": 40 + }, + { + "epoch": 0.04870920603994155, + "grad_norm": 0.4300410747528076, + "learning_rate": 0.0002, + "loss": 1.9458, + "step": 50 + }, + { + "epoch": 0.058451047247929856, + "grad_norm": 0.5063319802284241, + "learning_rate": 0.0002, + "loss": 1.983, + "step": 60 + }, + { + "epoch": 0.06819288845591817, + "grad_norm": 0.49310117959976196, + "learning_rate": 0.0002, + "loss": 1.9799, + "step": 70 + }, + { + "epoch": 0.07793472966390648, + "grad_norm": 0.4676004648208618, + "learning_rate": 0.0002, + "loss": 1.9277, + "step": 80 + }, + { + "epoch": 0.08767657087189479, + "grad_norm": 0.41647228598594666, + "learning_rate": 0.0002, + "loss": 1.9147, + "step": 90 + }, + { + "epoch": 0.0974184120798831, + "grad_norm": 0.40217313170433044, + "learning_rate": 0.0002, + "loss": 1.8894, + "step": 100 + }, + { + "epoch": 0.1071602532878714, + "grad_norm": 0.4123637080192566, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 110 + }, + { + "epoch": 0.11690209449585971, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 120 + }, + { + "epoch": 0.12664393570384802, + "grad_norm": 0.3179326355457306, + "learning_rate": 0.0002, + "loss": 1.894, + "step": 130 + }, + { + "epoch": 0.13638577691183634, + "grad_norm": 0.3548192083835602, + "learning_rate": 0.0002, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.14612761811982464, + "grad_norm": 0.3273540139198303, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 150 + }, + { + "epoch": 0.15586945932781296, + "grad_norm": 0.36500975489616394, + "learning_rate": 0.0002, + "loss": 1.8496, + "step": 160 + }, + { + "epoch": 0.16561130053580125, + "grad_norm": 0.4106619656085968, + "learning_rate": 0.0002, + "loss": 1.8473, + "step": 170 + }, + { + "epoch": 0.17535314174378958, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0002, + "loss": 1.841, + "step": 180 + }, + { + "epoch": 0.1850949829517779, + "grad_norm": 0.3608580231666565, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 190 + }, + { + "epoch": 0.1948368241597662, + "grad_norm": 0.4291760325431824, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 200 + }, + { + "epoch": 0.20457866536775451, + "grad_norm": 0.344184011220932, + "learning_rate": 0.0002, + "loss": 1.8437, + "step": 210 + }, + { + "epoch": 0.2143205065757428, + "grad_norm": 0.3834705650806427, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 220 + }, + { + "epoch": 0.22406234778373113, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 230 + }, + { + "epoch": 0.23380418899171942, + "grad_norm": 0.4306780695915222, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 240 + }, + { + "epoch": 0.24354603019970775, + "grad_norm": 0.5066465139389038, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 250 + }, + { + "epoch": 0.25328787140769604, + "grad_norm": 0.34227681159973145, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 260 + }, + { + "epoch": 0.26302971261568436, + "grad_norm": 0.3346865475177765, + "learning_rate": 0.0002, + "loss": 1.8614, + "step": 270 + }, + { + "epoch": 0.2727715538236727, + "grad_norm": 0.3639362156391144, + "learning_rate": 0.0002, + "loss": 1.8502, + "step": 280 + }, + { + "epoch": 0.282513395031661, + "grad_norm": 0.33223700523376465, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 290 + }, + { + "epoch": 0.2922552362396493, + "grad_norm": 0.35176315903663635, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 300 + }, + { + "epoch": 0.3019970774476376, + "grad_norm": 0.3581472635269165, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 310 + }, + { + "epoch": 0.3117389186556259, + "grad_norm": 0.35943421721458435, + "learning_rate": 0.0002, + "loss": 1.8262, + "step": 320 + }, + { + "epoch": 0.32148075986361424, + "grad_norm": 0.322051078081131, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 330 + }, + { + "epoch": 0.3312226010716025, + "grad_norm": 0.33904823660850525, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 340 + }, + { + "epoch": 0.34096444227959083, + "grad_norm": 0.39162731170654297, + "learning_rate": 0.0002, + "loss": 1.9159, + "step": 350 + }, + { + "epoch": 0.35070628348757915, + "grad_norm": 0.330624520778656, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 360 + }, + { + "epoch": 0.3604481246955675, + "grad_norm": 0.3793248236179352, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 370 + }, + { + "epoch": 0.3701899659035558, + "grad_norm": 0.3347395658493042, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 380 + }, + { + "epoch": 0.37993180711154406, + "grad_norm": 0.30527254939079285, + "learning_rate": 0.0002, + "loss": 1.9244, + "step": 390 + }, + { + "epoch": 0.3896736483195324, + "grad_norm": 0.3081390857696533, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 400 + }, + { + "epoch": 0.3994154895275207, + "grad_norm": 0.3742620050907135, + "learning_rate": 0.0002, + "loss": 1.8968, + "step": 410 + }, + { + "epoch": 0.40915733073550903, + "grad_norm": 0.4080568253993988, + "learning_rate": 0.0002, + "loss": 1.8095, + "step": 420 + }, + { + "epoch": 0.4188991719434973, + "grad_norm": 0.38034746050834656, + "learning_rate": 0.0002, + "loss": 1.8555, + "step": 430 + }, + { + "epoch": 0.4286410131514856, + "grad_norm": 0.34893402457237244, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 440 + }, + { + "epoch": 0.43838285435947394, + "grad_norm": 0.33285608887672424, + "learning_rate": 0.0002, + "loss": 1.8481, + "step": 450 + }, + { + "epoch": 0.44812469556746226, + "grad_norm": 0.4110095798969269, + "learning_rate": 0.0002, + "loss": 1.8466, + "step": 460 + }, + { + "epoch": 0.4578665367754506, + "grad_norm": 0.3658817410469055, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 470 + }, + { + "epoch": 0.46760837798343885, + "grad_norm": 0.31350770592689514, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 480 + }, + { + "epoch": 0.47735021919142717, + "grad_norm": 0.38827991485595703, + "learning_rate": 0.0002, + "loss": 1.7839, + "step": 490 + }, + { + "epoch": 0.4870920603994155, + "grad_norm": 0.3792393207550049, + "learning_rate": 0.0002, + "loss": 1.8224, + "step": 500 + }, + { + "epoch": 0.4968339016074038, + "grad_norm": 0.3004095256328583, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 510 + }, + { + "epoch": 0.5065757428153921, + "grad_norm": 0.3200063407421112, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 520 + }, + { + "epoch": 0.5163175840233805, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 530 + }, + { + "epoch": 0.5260594252313687, + "grad_norm": 0.30258631706237793, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 540 + }, + { + "epoch": 0.535801266439357, + "grad_norm": 0.28210392594337463, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 550 + }, + { + "epoch": 0.5455431076473454, + "grad_norm": 0.34854066371917725, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 560 + }, + { + "epoch": 0.5552849488553336, + "grad_norm": 0.31689873337745667, + "learning_rate": 0.0002, + "loss": 1.8331, + "step": 570 + }, + { + "epoch": 0.565026790063322, + "grad_norm": 0.31253790855407715, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 580 + }, + { + "epoch": 0.5747686312713103, + "grad_norm": 0.3229721188545227, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 590 + }, + { + "epoch": 0.5845104724792985, + "grad_norm": 0.3723772466182709, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 600 + }, + { + "epoch": 0.5942523136872869, + "grad_norm": 0.345798522233963, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 610 + }, + { + "epoch": 0.6039941548952752, + "grad_norm": 0.3440598249435425, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 620 + }, + { + "epoch": 0.6137359961032636, + "grad_norm": 0.3406416177749634, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 630 + }, + { + "epoch": 0.6234778373112518, + "grad_norm": 0.3218357264995575, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 640 + }, + { + "epoch": 0.6332196785192401, + "grad_norm": 0.45319172739982605, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 650 + }, + { + "epoch": 0.6429615197272285, + "grad_norm": 0.2787110507488251, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 660 + }, + { + "epoch": 0.6527033609352167, + "grad_norm": 0.3064707815647125, + "learning_rate": 0.0002, + "loss": 1.8426, + "step": 670 + }, + { + "epoch": 0.662445202143205, + "grad_norm": 0.2940629720687866, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 680 + }, + { + "epoch": 0.6721870433511934, + "grad_norm": 0.31695225834846497, + "learning_rate": 0.0002, + "loss": 1.7865, + "step": 690 + }, + { + "epoch": 0.6819288845591817, + "grad_norm": 0.29589611291885376, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 700 + }, + { + "epoch": 0.69167072576717, + "grad_norm": 0.3062121570110321, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 710 + }, + { + "epoch": 0.7014125669751583, + "grad_norm": 0.3315656781196594, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 720 + }, + { + "epoch": 0.7111544081831466, + "grad_norm": 0.30353930592536926, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 730 + }, + { + "epoch": 0.720896249391135, + "grad_norm": 0.28360483050346375, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 0.7306380905991232, + "grad_norm": 0.3362562656402588, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 750 + }, + { + "epoch": 0.7403799318071116, + "grad_norm": 0.40434667468070984, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 760 + }, + { + "epoch": 0.7501217730150999, + "grad_norm": 0.2930425703525543, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 770 + }, + { + "epoch": 0.7598636142230881, + "grad_norm": 0.30177003145217896, + "learning_rate": 0.0002, + "loss": 1.8216, + "step": 780 + }, + { + "epoch": 0.7696054554310765, + "grad_norm": 0.2784474790096283, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 790 + }, + { + "epoch": 0.7793472966390648, + "grad_norm": 0.35849854350090027, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 800 + }, + { + "epoch": 0.7890891378470531, + "grad_norm": 0.27329114079475403, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 810 + }, + { + "epoch": 0.7988309790550414, + "grad_norm": 0.33331671357154846, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 820 + }, + { + "epoch": 0.8085728202630297, + "grad_norm": 0.28727295994758606, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 830 + }, + { + "epoch": 0.8183146614710181, + "grad_norm": 0.31391268968582153, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 840 + }, + { + "epoch": 0.8280565026790063, + "grad_norm": 0.3303709030151367, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 850 + }, + { + "epoch": 0.8377983438869946, + "grad_norm": 0.33772537112236023, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 860 + }, + { + "epoch": 0.847540185094983, + "grad_norm": 0.32876333594322205, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 870 + }, + { + "epoch": 0.8572820263029712, + "grad_norm": 0.28444716334342957, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 880 + }, + { + "epoch": 0.8670238675109596, + "grad_norm": 0.3070019483566284, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 890 + }, + { + "epoch": 0.8767657087189479, + "grad_norm": 0.29484760761260986, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 900 + }, + { + "epoch": 0.8865075499269361, + "grad_norm": 0.32373034954071045, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 910 + }, + { + "epoch": 0.8962493911349245, + "grad_norm": 0.3229396939277649, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 920 + }, + { + "epoch": 0.9059912323429128, + "grad_norm": 0.33151453733444214, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 930 + }, + { + "epoch": 0.9157330735509012, + "grad_norm": 0.32037460803985596, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 940 + }, + { + "epoch": 0.9254749147588894, + "grad_norm": 0.31283533573150635, + "learning_rate": 0.0002, + "loss": 1.822, + "step": 950 + }, + { + "epoch": 0.9352167559668777, + "grad_norm": 0.27984118461608887, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 960 + }, + { + "epoch": 0.9449585971748661, + "grad_norm": 0.316500186920166, + "learning_rate": 0.0002, + "loss": 1.7755, + "step": 970 + }, + { + "epoch": 0.9547004383828543, + "grad_norm": 0.33708682656288147, + "learning_rate": 0.0002, + "loss": 1.8032, + "step": 980 + }, + { + "epoch": 0.9644422795908427, + "grad_norm": 0.31026017665863037, + "learning_rate": 0.0002, + "loss": 1.8863, + "step": 990 + }, + { + "epoch": 0.974184120798831, + "grad_norm": 0.30874672532081604, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 1000 + }, + { + "epoch": 0.9839259620068193, + "grad_norm": 0.3257741630077362, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1010 + }, + { + "epoch": 0.9936678032148076, + "grad_norm": 0.2865653932094574, + "learning_rate": 0.0002, + "loss": 1.7936, + "step": 1020 + }, + { + "epoch": 0.9995129079396006, + "eval_loss": 1.8103164434432983, + "eval_runtime": 56.3917, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 1.135, + "step": 1026 + }, + { + "epoch": 1.003409644422796, + "grad_norm": 0.2860608398914337, + "learning_rate": 0.0002, + "loss": 1.7013, + "step": 1030 + }, + { + "epoch": 1.0131514856307842, + "grad_norm": 0.3156210780143738, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 1040 + }, + { + "epoch": 1.0228933268387725, + "grad_norm": 0.26126575469970703, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 1050 + }, + { + "epoch": 1.032635168046761, + "grad_norm": 0.3019633889198303, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 1060 + }, + { + "epoch": 1.042377009254749, + "grad_norm": 0.534140944480896, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1070 + }, + { + "epoch": 1.0521188504627375, + "grad_norm": 0.311872661113739, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 1080 + }, + { + "epoch": 1.0618606916707258, + "grad_norm": 0.3276001513004303, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 1090 + }, + { + "epoch": 1.071602532878714, + "grad_norm": 0.35227468609809875, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1100 + }, + { + "epoch": 1.0813443740867024, + "grad_norm": 0.3597564995288849, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 1110 + }, + { + "epoch": 1.0910862152946907, + "grad_norm": 0.3547225296497345, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 1120 + }, + { + "epoch": 1.100828056502679, + "grad_norm": 0.3399786353111267, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1130 + }, + { + "epoch": 1.1105698977106673, + "grad_norm": 0.3309086263179779, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 1140 + }, + { + "epoch": 1.1203117389186557, + "grad_norm": 0.39330706000328064, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1150 + }, + { + "epoch": 1.130053580126644, + "grad_norm": 0.3628021776676178, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1160 + }, + { + "epoch": 1.1397954213346322, + "grad_norm": 0.32995012402534485, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 1170 + }, + { + "epoch": 1.1495372625426206, + "grad_norm": 0.36292821168899536, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1180 + }, + { + "epoch": 1.159279103750609, + "grad_norm": 0.3470092713832855, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 1190 + }, + { + "epoch": 1.169020944958597, + "grad_norm": 0.3496156334877014, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1200 + }, + { + "epoch": 1.1787627861665855, + "grad_norm": 0.3442084789276123, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1210 + }, + { + "epoch": 1.1885046273745739, + "grad_norm": 0.34983909130096436, + "learning_rate": 0.0002, + "loss": 1.7763, + "step": 1220 + }, + { + "epoch": 1.198246468582562, + "grad_norm": 0.36505937576293945, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1230 + }, + { + "epoch": 1.2079883097905504, + "grad_norm": 0.31624770164489746, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1240 + }, + { + "epoch": 1.2177301509985388, + "grad_norm": 0.3528020679950714, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1250 + }, + { + "epoch": 1.2274719922065271, + "grad_norm": 0.29294025897979736, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1260 + }, + { + "epoch": 1.2372138334145153, + "grad_norm": 0.35048434138298035, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1270 + }, + { + "epoch": 1.2469556746225037, + "grad_norm": 0.35224461555480957, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1280 + }, + { + "epoch": 1.256697515830492, + "grad_norm": 0.4041554629802704, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1290 + }, + { + "epoch": 1.2664393570384802, + "grad_norm": 0.3447791039943695, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1300 + }, + { + "epoch": 1.2761811982464686, + "grad_norm": 0.3315333425998688, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1310 + }, + { + "epoch": 1.285923039454457, + "grad_norm": 0.3587741255760193, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 1320 + }, + { + "epoch": 1.2956648806624451, + "grad_norm": 0.3704394996166229, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1330 + }, + { + "epoch": 1.3054067218704335, + "grad_norm": 0.38131803274154663, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1340 + }, + { + "epoch": 1.3151485630784219, + "grad_norm": 0.36109617352485657, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 1350 + }, + { + "epoch": 1.32489040428641, + "grad_norm": 0.37283554673194885, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 1360 + }, + { + "epoch": 1.3346322454943984, + "grad_norm": 0.31808891892433167, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1370 + }, + { + "epoch": 1.3443740867023868, + "grad_norm": 0.3370385766029358, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1380 + }, + { + "epoch": 1.354115927910375, + "grad_norm": 0.3568558394908905, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1390 + }, + { + "epoch": 1.3638577691183633, + "grad_norm": 0.3537410497665405, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1400 + }, + { + "epoch": 1.3735996103263517, + "grad_norm": 0.3536544144153595, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 1410 + }, + { + "epoch": 1.38334145153434, + "grad_norm": 0.3772895038127899, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1420 + }, + { + "epoch": 1.3930832927423282, + "grad_norm": 0.38079720735549927, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1430 + }, + { + "epoch": 1.4028251339503166, + "grad_norm": 0.3811109662055969, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1440 + }, + { + "epoch": 1.412566975158305, + "grad_norm": 0.38586318492889404, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 1450 + }, + { + "epoch": 1.4223088163662934, + "grad_norm": 0.3405744135379791, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1460 + }, + { + "epoch": 1.4320506575742815, + "grad_norm": 0.39527642726898193, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1470 + }, + { + "epoch": 1.44179249878227, + "grad_norm": 0.4494728744029999, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 1480 + }, + { + "epoch": 1.4515343399902583, + "grad_norm": 0.34068453311920166, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1490 + }, + { + "epoch": 1.4612761811982464, + "grad_norm": 0.36169710755348206, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 1500 + }, + { + "epoch": 1.4710180224062348, + "grad_norm": 0.31519418954849243, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 1510 + }, + { + "epoch": 1.4807598636142232, + "grad_norm": 0.35117292404174805, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.4905017048222113, + "grad_norm": 0.40951141715049744, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1530 + }, + { + "epoch": 1.5002435460301997, + "grad_norm": 0.37542906403541565, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 1540 + }, + { + "epoch": 1.509985387238188, + "grad_norm": 0.35395753383636475, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1550 + }, + { + "epoch": 1.5197272284461762, + "grad_norm": 0.35497018694877625, + "learning_rate": 0.0002, + "loss": 1.6517, + "step": 1560 + }, + { + "epoch": 1.5294690696541646, + "grad_norm": 0.3693031072616577, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 1570 + }, + { + "epoch": 1.539210910862153, + "grad_norm": 0.34013301134109497, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1580 + }, + { + "epoch": 1.5489527520701412, + "grad_norm": 0.37312784790992737, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1590 + }, + { + "epoch": 1.5586945932781295, + "grad_norm": 0.357496440410614, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1600 + }, + { + "epoch": 1.568436434486118, + "grad_norm": 0.35192370414733887, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1610 + }, + { + "epoch": 1.578178275694106, + "grad_norm": 0.34144821763038635, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 1620 + }, + { + "epoch": 1.5879201169020944, + "grad_norm": 0.3320509195327759, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 1630 + }, + { + "epoch": 1.5976619581100828, + "grad_norm": 0.34178847074508667, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1640 + }, + { + "epoch": 1.607403799318071, + "grad_norm": 0.36567580699920654, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 1650 + }, + { + "epoch": 1.6171456405260596, + "grad_norm": 0.35599812865257263, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1660 + }, + { + "epoch": 1.6268874817340477, + "grad_norm": 0.33765384554862976, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1670 + }, + { + "epoch": 1.636629322942036, + "grad_norm": 0.33142679929733276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1680 + }, + { + "epoch": 1.6463711641500245, + "grad_norm": 0.6959079504013062, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1690 + }, + { + "epoch": 1.6561130053580126, + "grad_norm": 0.35073819756507874, + "learning_rate": 0.0002, + "loss": 1.7665, + "step": 1700 + }, + { + "epoch": 1.665854846566001, + "grad_norm": 0.3461478352546692, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1710 + }, + { + "epoch": 1.6755966877739894, + "grad_norm": 0.3697752058506012, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 1720 + }, + { + "epoch": 1.6853385289819776, + "grad_norm": 0.3755154609680176, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1730 + }, + { + "epoch": 1.695080370189966, + "grad_norm": 0.33977627754211426, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 1740 + }, + { + "epoch": 1.7048222113979543, + "grad_norm": 0.4001041650772095, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1750 + }, + { + "epoch": 1.7145640526059425, + "grad_norm": 0.36998286843299866, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 1760 + }, + { + "epoch": 1.7243058938139308, + "grad_norm": 0.39944565296173096, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1770 + }, + { + "epoch": 1.7340477350219192, + "grad_norm": 0.4002859890460968, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1780 + }, + { + "epoch": 1.7437895762299074, + "grad_norm": 0.33336859941482544, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 1790 + }, + { + "epoch": 1.7535314174378958, + "grad_norm": 0.35853952169418335, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 1800 + }, + { + "epoch": 1.7632732586458841, + "grad_norm": 0.35876700282096863, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1810 + }, + { + "epoch": 1.7730150998538723, + "grad_norm": 0.3497968912124634, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 1820 + }, + { + "epoch": 1.7827569410618607, + "grad_norm": 0.33182016015052795, + "learning_rate": 0.0002, + "loss": 1.7128, + "step": 1830 + }, + { + "epoch": 1.792498782269849, + "grad_norm": 0.33359771966934204, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1840 + }, + { + "epoch": 1.8022406234778372, + "grad_norm": 0.38070961833000183, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 1850 + }, + { + "epoch": 1.8119824646858256, + "grad_norm": 0.34111160039901733, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 1860 + }, + { + "epoch": 1.821724305893814, + "grad_norm": 0.4439302980899811, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1870 + }, + { + "epoch": 1.8314661471018021, + "grad_norm": 0.37065210938453674, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 1880 + }, + { + "epoch": 1.8412079883097907, + "grad_norm": 0.33630406856536865, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 1890 + }, + { + "epoch": 1.8509498295177789, + "grad_norm": 0.334553986787796, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 1900 + }, + { + "epoch": 1.860691670725767, + "grad_norm": 0.3603808879852295, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1910 + }, + { + "epoch": 1.8704335119337556, + "grad_norm": 0.4307343363761902, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1920 + }, + { + "epoch": 1.8801753531417438, + "grad_norm": 0.455602765083313, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1930 + }, + { + "epoch": 1.8899171943497322, + "grad_norm": 0.35242316126823425, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 1940 + }, + { + "epoch": 1.8996590355577205, + "grad_norm": 0.3589116632938385, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1950 + }, + { + "epoch": 1.9094008767657087, + "grad_norm": 0.3540741801261902, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1960 + }, + { + "epoch": 1.919142717973697, + "grad_norm": 0.3547612428665161, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 1970 + }, + { + "epoch": 1.9288845591816854, + "grad_norm": 0.3485773503780365, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1980 + }, + { + "epoch": 1.9386264003896736, + "grad_norm": 0.3560304641723633, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1990 + }, + { + "epoch": 1.948368241597662, + "grad_norm": 0.33299335837364197, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 2000 + }, + { + "epoch": 1.9581100828056504, + "grad_norm": 0.35622233152389526, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2010 + }, + { + "epoch": 1.9678519240136385, + "grad_norm": 0.3681301474571228, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 2020 + }, + { + "epoch": 1.9775937652216269, + "grad_norm": 0.36158084869384766, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2030 + }, + { + "epoch": 1.9873356064296153, + "grad_norm": 0.32560569047927856, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 2040 + }, + { + "epoch": 1.9970774476376034, + "grad_norm": 0.37404149770736694, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 2050 + }, + { + "epoch": 2.0, + "eval_loss": 1.8119343519210815, + "eval_runtime": 96.0045, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 0.667, + "step": 2053 + }, + { + "epoch": 2.006819288845592, + "grad_norm": 0.374188631772995, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 2060 + }, + { + "epoch": 2.01656113005358, + "grad_norm": 0.421764075756073, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2070 + }, + { + "epoch": 2.0263029712615683, + "grad_norm": 0.43841829895973206, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2080 + }, + { + "epoch": 2.036044812469557, + "grad_norm": 0.42298218607902527, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 2090 + }, + { + "epoch": 2.045786653677545, + "grad_norm": 0.43669602274894714, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 2100 + }, + { + "epoch": 2.0555284948855332, + "grad_norm": 0.4080469012260437, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 2110 + }, + { + "epoch": 2.065270336093522, + "grad_norm": 0.483192503452301, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2120 + }, + { + "epoch": 2.07501217730151, + "grad_norm": 0.44427400827407837, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 2130 + }, + { + "epoch": 2.084754018509498, + "grad_norm": 0.48835131525993347, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 2140 + }, + { + "epoch": 2.0944958597174868, + "grad_norm": 0.42733684182167053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2150 + }, + { + "epoch": 2.104237700925475, + "grad_norm": 0.4258694648742676, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 2160 + }, + { + "epoch": 2.113979542133463, + "grad_norm": 0.5164985656738281, + "learning_rate": 0.0002, + "loss": 1.517, + "step": 2170 + }, + { + "epoch": 2.1237213833414517, + "grad_norm": 0.4279228150844574, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 2180 + }, + { + "epoch": 2.13346322454944, + "grad_norm": 0.48209506273269653, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2190 + }, + { + "epoch": 2.143205065757428, + "grad_norm": 0.4071785509586334, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 2200 + }, + { + "epoch": 2.1529469069654166, + "grad_norm": 0.4629398584365845, + "learning_rate": 0.0002, + "loss": 1.5469, + "step": 2210 + }, + { + "epoch": 2.1626887481734047, + "grad_norm": 0.44390997290611267, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2220 + }, + { + "epoch": 2.172430589381393, + "grad_norm": 0.46886971592903137, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 2230 + }, + { + "epoch": 2.1821724305893815, + "grad_norm": 0.43745434284210205, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 2240 + }, + { + "epoch": 2.1919142717973696, + "grad_norm": 0.42737245559692383, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 2250 + }, + { + "epoch": 2.201656113005358, + "grad_norm": 0.5028428435325623, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 2260 + }, + { + "epoch": 2.2113979542133464, + "grad_norm": 0.48987212777137756, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 2270 + }, + { + "epoch": 2.2211397954213346, + "grad_norm": 0.48186370730400085, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 2280 + }, + { + "epoch": 2.2308816366293227, + "grad_norm": 0.4417429566383362, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 2290 + }, + { + "epoch": 2.2406234778373113, + "grad_norm": 0.4757710099220276, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2300 + }, + { + "epoch": 2.2503653190452995, + "grad_norm": 0.44449448585510254, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2310 + }, + { + "epoch": 2.260107160253288, + "grad_norm": 0.5070863962173462, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 2320 + }, + { + "epoch": 2.269849001461276, + "grad_norm": 0.4967133700847626, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2330 + }, + { + "epoch": 2.2795908426692644, + "grad_norm": 0.5110220909118652, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2340 + }, + { + "epoch": 2.289332683877253, + "grad_norm": 0.47984135150909424, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2350 + }, + { + "epoch": 2.299074525085241, + "grad_norm": 0.5005794763565063, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 2360 + }, + { + "epoch": 2.3088163662932293, + "grad_norm": 0.4991425573825836, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 2370 + }, + { + "epoch": 2.318558207501218, + "grad_norm": 0.4948616623878479, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2380 + }, + { + "epoch": 2.328300048709206, + "grad_norm": 0.4533160328865051, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2390 + }, + { + "epoch": 2.338041889917194, + "grad_norm": 0.5871071219444275, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 2400 + }, + { + "epoch": 2.347783731125183, + "grad_norm": 0.5048075914382935, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2410 + }, + { + "epoch": 2.357525572333171, + "grad_norm": 0.4973750412464142, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 2420 + }, + { + "epoch": 2.367267413541159, + "grad_norm": 0.48294538259506226, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 2430 + }, + { + "epoch": 2.3770092547491477, + "grad_norm": 0.7180454134941101, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 2440 + }, + { + "epoch": 2.386751095957136, + "grad_norm": 0.4627632796764374, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2450 + }, + { + "epoch": 2.396492937165124, + "grad_norm": 0.4834378957748413, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 2460 + }, + { + "epoch": 2.4062347783731126, + "grad_norm": 0.5173670649528503, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 2470 + }, + { + "epoch": 2.4159766195811008, + "grad_norm": 0.49652737379074097, + "learning_rate": 0.0002, + "loss": 1.5464, + "step": 2480 + }, + { + "epoch": 2.4257184607890894, + "grad_norm": 0.47052669525146484, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 2490 + }, + { + "epoch": 2.4354603019970775, + "grad_norm": 0.5188006162643433, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 2500 + }, + { + "epoch": 2.4452021432050657, + "grad_norm": 0.5010119676589966, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 2510 + }, + { + "epoch": 2.4549439844130543, + "grad_norm": 0.4765235483646393, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2520 + }, + { + "epoch": 2.4646858256210424, + "grad_norm": 0.5292699337005615, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2530 + }, + { + "epoch": 2.4744276668290306, + "grad_norm": 0.48555099964141846, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 2540 + }, + { + "epoch": 2.484169508037019, + "grad_norm": 0.4764043092727661, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 2550 + }, + { + "epoch": 2.4939113492450073, + "grad_norm": 0.47839659452438354, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 2560 + }, + { + "epoch": 2.5036531904529955, + "grad_norm": 0.4514436721801758, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 2570 + }, + { + "epoch": 2.513395031660984, + "grad_norm": 0.5681955218315125, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2580 + }, + { + "epoch": 2.5231368728689723, + "grad_norm": 0.49655985832214355, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 2590 + }, + { + "epoch": 2.5328787140769604, + "grad_norm": 0.5077657103538513, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2600 + }, + { + "epoch": 2.542620555284949, + "grad_norm": 0.5643279552459717, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 2610 + }, + { + "epoch": 2.552362396492937, + "grad_norm": 0.4715031087398529, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 2620 + }, + { + "epoch": 2.5621042377009253, + "grad_norm": 0.528400719165802, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2630 + }, + { + "epoch": 2.571846078908914, + "grad_norm": 0.49469611048698425, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2640 + }, + { + "epoch": 2.581587920116902, + "grad_norm": 0.4567806124687195, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 2650 + }, + { + "epoch": 2.5913297613248902, + "grad_norm": 0.5357107520103455, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2660 + }, + { + "epoch": 2.601071602532879, + "grad_norm": 0.46977677941322327, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 2670 + }, + { + "epoch": 2.610813443740867, + "grad_norm": 0.6626771092414856, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 2680 + }, + { + "epoch": 2.620555284948855, + "grad_norm": 0.4587472081184387, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2690 + }, + { + "epoch": 2.6302971261568437, + "grad_norm": 0.4816797077655792, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2700 + }, + { + "epoch": 2.640038967364832, + "grad_norm": 0.4856809675693512, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.64978080857282, + "grad_norm": 0.46010780334472656, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2720 + }, + { + "epoch": 2.6595226497808087, + "grad_norm": 0.4637954533100128, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 2730 + }, + { + "epoch": 2.669264490988797, + "grad_norm": 0.5954997539520264, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 2740 + }, + { + "epoch": 2.679006332196785, + "grad_norm": 0.5071861743927002, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2750 + }, + { + "epoch": 2.6887481734047736, + "grad_norm": 0.5415477156639099, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.6984900146127617, + "grad_norm": 0.5618549585342407, + "learning_rate": 0.0002, + "loss": 1.5476, + "step": 2770 + }, + { + "epoch": 2.70823185582075, + "grad_norm": 0.49338817596435547, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 2780 + }, + { + "epoch": 2.7179736970287385, + "grad_norm": 0.5149586796760559, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2790 + }, + { + "epoch": 2.7277155382367266, + "grad_norm": 0.6247242093086243, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2800 + }, + { + "epoch": 2.737457379444715, + "grad_norm": 0.4749542474746704, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2810 + }, + { + "epoch": 2.7471992206527034, + "grad_norm": 0.4979191720485687, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 2820 + }, + { + "epoch": 2.7569410618606915, + "grad_norm": 0.4885074198246002, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2830 + }, + { + "epoch": 2.76668290306868, + "grad_norm": 0.5047747492790222, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2840 + }, + { + "epoch": 2.7764247442766683, + "grad_norm": 0.5280140042304993, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2850 + }, + { + "epoch": 2.7861665854846565, + "grad_norm": 0.477668434381485, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 2860 + }, + { + "epoch": 2.795908426692645, + "grad_norm": 0.4816327393054962, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2870 + }, + { + "epoch": 2.805650267900633, + "grad_norm": 0.523259162902832, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2880 + }, + { + "epoch": 2.8153921091086214, + "grad_norm": 0.5045270919799805, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2890 + }, + { + "epoch": 2.82513395031661, + "grad_norm": 0.47986042499542236, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2900 + }, + { + "epoch": 2.834875791524598, + "grad_norm": 0.4858797490596771, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2910 + }, + { + "epoch": 2.8446176327325867, + "grad_norm": 0.5261512398719788, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 2920 + }, + { + "epoch": 2.854359473940575, + "grad_norm": 0.630550742149353, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2930 + }, + { + "epoch": 2.864101315148563, + "grad_norm": 0.49119752645492554, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2940 + }, + { + "epoch": 2.8738431563565516, + "grad_norm": 0.4779070317745209, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2950 + }, + { + "epoch": 2.88358499756454, + "grad_norm": 0.5059782266616821, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 2960 + }, + { + "epoch": 2.893326838772528, + "grad_norm": 0.5466655492782593, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2970 + }, + { + "epoch": 2.9030686799805165, + "grad_norm": 0.4865640103816986, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 2980 + }, + { + "epoch": 2.9128105211885047, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 2990 + }, + { + "epoch": 2.922552362396493, + "grad_norm": 0.9112305641174316, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 3000 + }, + { + "epoch": 2.9322942036044815, + "grad_norm": 0.3938814103603363, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 3010 + }, + { + "epoch": 2.9420360448124696, + "grad_norm": 0.5500800609588623, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 3020 + }, + { + "epoch": 2.9517778860204578, + "grad_norm": 0.5346390604972839, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3030 + }, + { + "epoch": 2.9615197272284464, + "grad_norm": 0.5245014429092407, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3040 + }, + { + "epoch": 2.9712615684364345, + "grad_norm": 0.4906884431838989, + "learning_rate": 0.0002, + "loss": 1.5549, + "step": 3050 + }, + { + "epoch": 2.9810034096444227, + "grad_norm": 0.47086769342422485, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3060 + }, + { + "epoch": 2.9907452508524113, + "grad_norm": 0.5290229320526123, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3070 + }, + { + "epoch": 2.9995129079396006, + "eval_loss": 1.8463934659957886, + "eval_runtime": 56.2401, + "eval_samples_per_second": 9.015, + "eval_steps_per_second": 1.138, + "step": 3079 + } + ], + "logging_steps": 10, + "max_steps": 8208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5817465466191872e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-3079/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a1b673a485c6d49b2f1c19fca7dcff22c7519fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eecae5ade18bceb41c2e2d950888a3c5fc7c7eb9b22f7a79be673d57aeebc0cb +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..feb097f02a656db7a8afdfa9cf15ef42b4f89fe6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a02776176843d555f9ac137f9ee80709af7c0bc6d6bd635d984a5ad01b6d8f19 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..363a8cd676b3c311e6111610f15fb2adcb6763c5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2964ed1ed02c299e8f77f89417ef45f32346df96327ad950efe91cc89ca0ef0 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..39e10b564e4ee08c5e32273c2e11f793fb847cf6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be842ef516859975dd85baf06d955a7d7408a2bc63646e1b3e2cff0eba1c3164 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9dba95477b6a4a28f5f4583e87319d4dbae3ca0b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/trainer_state.json @@ -0,0 +1,2935 @@ +{ + "best_metric": 1.8103164434432983, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 4106, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00974184120798831, + "grad_norm": 0.6537588834762573, + "learning_rate": 0.0002, + "loss": 3.0782, + "step": 10 + }, + { + "epoch": 0.01948368241597662, + "grad_norm": 0.5270306468009949, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 20 + }, + { + "epoch": 0.029225523623964928, + "grad_norm": 0.6826501488685608, + "learning_rate": 0.0002, + "loss": 2.2589, + "step": 30 + }, + { + "epoch": 0.03896736483195324, + "grad_norm": 0.5061377286911011, + "learning_rate": 0.0002, + "loss": 2.0141, + "step": 40 + }, + { + "epoch": 0.04870920603994155, + "grad_norm": 0.4300410747528076, + "learning_rate": 0.0002, + "loss": 1.9458, + "step": 50 + }, + { + "epoch": 0.058451047247929856, + "grad_norm": 0.5063319802284241, + "learning_rate": 0.0002, + "loss": 1.983, + "step": 60 + }, + { + "epoch": 0.06819288845591817, + "grad_norm": 0.49310117959976196, + "learning_rate": 0.0002, + "loss": 1.9799, + "step": 70 + }, + { + "epoch": 0.07793472966390648, + "grad_norm": 0.4676004648208618, + "learning_rate": 0.0002, + "loss": 1.9277, + "step": 80 + }, + { + "epoch": 0.08767657087189479, + "grad_norm": 0.41647228598594666, + "learning_rate": 0.0002, + "loss": 1.9147, + "step": 90 + }, + { + "epoch": 0.0974184120798831, + "grad_norm": 0.40217313170433044, + "learning_rate": 0.0002, + "loss": 1.8894, + "step": 100 + }, + { + "epoch": 0.1071602532878714, + "grad_norm": 0.4123637080192566, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 110 + }, + { + "epoch": 0.11690209449585971, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 120 + }, + { + "epoch": 0.12664393570384802, + "grad_norm": 0.3179326355457306, + "learning_rate": 0.0002, + "loss": 1.894, + "step": 130 + }, + { + "epoch": 0.13638577691183634, + "grad_norm": 0.3548192083835602, + "learning_rate": 0.0002, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.14612761811982464, + "grad_norm": 0.3273540139198303, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 150 + }, + { + "epoch": 0.15586945932781296, + "grad_norm": 0.36500975489616394, + "learning_rate": 0.0002, + "loss": 1.8496, + "step": 160 + }, + { + "epoch": 0.16561130053580125, + "grad_norm": 0.4106619656085968, + "learning_rate": 0.0002, + "loss": 1.8473, + "step": 170 + }, + { + "epoch": 0.17535314174378958, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0002, + "loss": 1.841, + "step": 180 + }, + { + "epoch": 0.1850949829517779, + "grad_norm": 0.3608580231666565, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 190 + }, + { + "epoch": 0.1948368241597662, + "grad_norm": 0.4291760325431824, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 200 + }, + { + "epoch": 0.20457866536775451, + "grad_norm": 0.344184011220932, + "learning_rate": 0.0002, + "loss": 1.8437, + "step": 210 + }, + { + "epoch": 0.2143205065757428, + "grad_norm": 0.3834705650806427, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 220 + }, + { + "epoch": 0.22406234778373113, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 230 + }, + { + "epoch": 0.23380418899171942, + "grad_norm": 0.4306780695915222, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 240 + }, + { + "epoch": 0.24354603019970775, + "grad_norm": 0.5066465139389038, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 250 + }, + { + "epoch": 0.25328787140769604, + "grad_norm": 0.34227681159973145, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 260 + }, + { + "epoch": 0.26302971261568436, + "grad_norm": 0.3346865475177765, + "learning_rate": 0.0002, + "loss": 1.8614, + "step": 270 + }, + { + "epoch": 0.2727715538236727, + "grad_norm": 0.3639362156391144, + "learning_rate": 0.0002, + "loss": 1.8502, + "step": 280 + }, + { + "epoch": 0.282513395031661, + "grad_norm": 0.33223700523376465, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 290 + }, + { + "epoch": 0.2922552362396493, + "grad_norm": 0.35176315903663635, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 300 + }, + { + "epoch": 0.3019970774476376, + "grad_norm": 0.3581472635269165, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 310 + }, + { + "epoch": 0.3117389186556259, + "grad_norm": 0.35943421721458435, + "learning_rate": 0.0002, + "loss": 1.8262, + "step": 320 + }, + { + "epoch": 0.32148075986361424, + "grad_norm": 0.322051078081131, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 330 + }, + { + "epoch": 0.3312226010716025, + "grad_norm": 0.33904823660850525, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 340 + }, + { + "epoch": 0.34096444227959083, + "grad_norm": 0.39162731170654297, + "learning_rate": 0.0002, + "loss": 1.9159, + "step": 350 + }, + { + "epoch": 0.35070628348757915, + "grad_norm": 0.330624520778656, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 360 + }, + { + "epoch": 0.3604481246955675, + "grad_norm": 0.3793248236179352, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 370 + }, + { + "epoch": 0.3701899659035558, + "grad_norm": 0.3347395658493042, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 380 + }, + { + "epoch": 0.37993180711154406, + "grad_norm": 0.30527254939079285, + "learning_rate": 0.0002, + "loss": 1.9244, + "step": 390 + }, + { + "epoch": 0.3896736483195324, + "grad_norm": 0.3081390857696533, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 400 + }, + { + "epoch": 0.3994154895275207, + "grad_norm": 0.3742620050907135, + "learning_rate": 0.0002, + "loss": 1.8968, + "step": 410 + }, + { + "epoch": 0.40915733073550903, + "grad_norm": 0.4080568253993988, + "learning_rate": 0.0002, + "loss": 1.8095, + "step": 420 + }, + { + "epoch": 0.4188991719434973, + "grad_norm": 0.38034746050834656, + "learning_rate": 0.0002, + "loss": 1.8555, + "step": 430 + }, + { + "epoch": 0.4286410131514856, + "grad_norm": 0.34893402457237244, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 440 + }, + { + "epoch": 0.43838285435947394, + "grad_norm": 0.33285608887672424, + "learning_rate": 0.0002, + "loss": 1.8481, + "step": 450 + }, + { + "epoch": 0.44812469556746226, + "grad_norm": 0.4110095798969269, + "learning_rate": 0.0002, + "loss": 1.8466, + "step": 460 + }, + { + "epoch": 0.4578665367754506, + "grad_norm": 0.3658817410469055, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 470 + }, + { + "epoch": 0.46760837798343885, + "grad_norm": 0.31350770592689514, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 480 + }, + { + "epoch": 0.47735021919142717, + "grad_norm": 0.38827991485595703, + "learning_rate": 0.0002, + "loss": 1.7839, + "step": 490 + }, + { + "epoch": 0.4870920603994155, + "grad_norm": 0.3792393207550049, + "learning_rate": 0.0002, + "loss": 1.8224, + "step": 500 + }, + { + "epoch": 0.4968339016074038, + "grad_norm": 0.3004095256328583, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 510 + }, + { + "epoch": 0.5065757428153921, + "grad_norm": 0.3200063407421112, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 520 + }, + { + "epoch": 0.5163175840233805, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 530 + }, + { + "epoch": 0.5260594252313687, + "grad_norm": 0.30258631706237793, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 540 + }, + { + "epoch": 0.535801266439357, + "grad_norm": 0.28210392594337463, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 550 + }, + { + "epoch": 0.5455431076473454, + "grad_norm": 0.34854066371917725, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 560 + }, + { + "epoch": 0.5552849488553336, + "grad_norm": 0.31689873337745667, + "learning_rate": 0.0002, + "loss": 1.8331, + "step": 570 + }, + { + "epoch": 0.565026790063322, + "grad_norm": 0.31253790855407715, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 580 + }, + { + "epoch": 0.5747686312713103, + "grad_norm": 0.3229721188545227, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 590 + }, + { + "epoch": 0.5845104724792985, + "grad_norm": 0.3723772466182709, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 600 + }, + { + "epoch": 0.5942523136872869, + "grad_norm": 0.345798522233963, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 610 + }, + { + "epoch": 0.6039941548952752, + "grad_norm": 0.3440598249435425, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 620 + }, + { + "epoch": 0.6137359961032636, + "grad_norm": 0.3406416177749634, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 630 + }, + { + "epoch": 0.6234778373112518, + "grad_norm": 0.3218357264995575, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 640 + }, + { + "epoch": 0.6332196785192401, + "grad_norm": 0.45319172739982605, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 650 + }, + { + "epoch": 0.6429615197272285, + "grad_norm": 0.2787110507488251, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 660 + }, + { + "epoch": 0.6527033609352167, + "grad_norm": 0.3064707815647125, + "learning_rate": 0.0002, + "loss": 1.8426, + "step": 670 + }, + { + "epoch": 0.662445202143205, + "grad_norm": 0.2940629720687866, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 680 + }, + { + "epoch": 0.6721870433511934, + "grad_norm": 0.31695225834846497, + "learning_rate": 0.0002, + "loss": 1.7865, + "step": 690 + }, + { + "epoch": 0.6819288845591817, + "grad_norm": 0.29589611291885376, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 700 + }, + { + "epoch": 0.69167072576717, + "grad_norm": 0.3062121570110321, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 710 + }, + { + "epoch": 0.7014125669751583, + "grad_norm": 0.3315656781196594, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 720 + }, + { + "epoch": 0.7111544081831466, + "grad_norm": 0.30353930592536926, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 730 + }, + { + "epoch": 0.720896249391135, + "grad_norm": 0.28360483050346375, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 0.7306380905991232, + "grad_norm": 0.3362562656402588, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 750 + }, + { + "epoch": 0.7403799318071116, + "grad_norm": 0.40434667468070984, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 760 + }, + { + "epoch": 0.7501217730150999, + "grad_norm": 0.2930425703525543, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 770 + }, + { + "epoch": 0.7598636142230881, + "grad_norm": 0.30177003145217896, + "learning_rate": 0.0002, + "loss": 1.8216, + "step": 780 + }, + { + "epoch": 0.7696054554310765, + "grad_norm": 0.2784474790096283, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 790 + }, + { + "epoch": 0.7793472966390648, + "grad_norm": 0.35849854350090027, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 800 + }, + { + "epoch": 0.7890891378470531, + "grad_norm": 0.27329114079475403, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 810 + }, + { + "epoch": 0.7988309790550414, + "grad_norm": 0.33331671357154846, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 820 + }, + { + "epoch": 0.8085728202630297, + "grad_norm": 0.28727295994758606, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 830 + }, + { + "epoch": 0.8183146614710181, + "grad_norm": 0.31391268968582153, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 840 + }, + { + "epoch": 0.8280565026790063, + "grad_norm": 0.3303709030151367, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 850 + }, + { + "epoch": 0.8377983438869946, + "grad_norm": 0.33772537112236023, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 860 + }, + { + "epoch": 0.847540185094983, + "grad_norm": 0.32876333594322205, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 870 + }, + { + "epoch": 0.8572820263029712, + "grad_norm": 0.28444716334342957, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 880 + }, + { + "epoch": 0.8670238675109596, + "grad_norm": 0.3070019483566284, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 890 + }, + { + "epoch": 0.8767657087189479, + "grad_norm": 0.29484760761260986, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 900 + }, + { + "epoch": 0.8865075499269361, + "grad_norm": 0.32373034954071045, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 910 + }, + { + "epoch": 0.8962493911349245, + "grad_norm": 0.3229396939277649, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 920 + }, + { + "epoch": 0.9059912323429128, + "grad_norm": 0.33151453733444214, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 930 + }, + { + "epoch": 0.9157330735509012, + "grad_norm": 0.32037460803985596, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 940 + }, + { + "epoch": 0.9254749147588894, + "grad_norm": 0.31283533573150635, + "learning_rate": 0.0002, + "loss": 1.822, + "step": 950 + }, + { + "epoch": 0.9352167559668777, + "grad_norm": 0.27984118461608887, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 960 + }, + { + "epoch": 0.9449585971748661, + "grad_norm": 0.316500186920166, + "learning_rate": 0.0002, + "loss": 1.7755, + "step": 970 + }, + { + "epoch": 0.9547004383828543, + "grad_norm": 0.33708682656288147, + "learning_rate": 0.0002, + "loss": 1.8032, + "step": 980 + }, + { + "epoch": 0.9644422795908427, + "grad_norm": 0.31026017665863037, + "learning_rate": 0.0002, + "loss": 1.8863, + "step": 990 + }, + { + "epoch": 0.974184120798831, + "grad_norm": 0.30874672532081604, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 1000 + }, + { + "epoch": 0.9839259620068193, + "grad_norm": 0.3257741630077362, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1010 + }, + { + "epoch": 0.9936678032148076, + "grad_norm": 0.2865653932094574, + "learning_rate": 0.0002, + "loss": 1.7936, + "step": 1020 + }, + { + "epoch": 0.9995129079396006, + "eval_loss": 1.8103164434432983, + "eval_runtime": 56.3917, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 1.135, + "step": 1026 + }, + { + "epoch": 1.003409644422796, + "grad_norm": 0.2860608398914337, + "learning_rate": 0.0002, + "loss": 1.7013, + "step": 1030 + }, + { + "epoch": 1.0131514856307842, + "grad_norm": 0.3156210780143738, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 1040 + }, + { + "epoch": 1.0228933268387725, + "grad_norm": 0.26126575469970703, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 1050 + }, + { + "epoch": 1.032635168046761, + "grad_norm": 0.3019633889198303, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 1060 + }, + { + "epoch": 1.042377009254749, + "grad_norm": 0.534140944480896, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1070 + }, + { + "epoch": 1.0521188504627375, + "grad_norm": 0.311872661113739, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 1080 + }, + { + "epoch": 1.0618606916707258, + "grad_norm": 0.3276001513004303, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 1090 + }, + { + "epoch": 1.071602532878714, + "grad_norm": 0.35227468609809875, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1100 + }, + { + "epoch": 1.0813443740867024, + "grad_norm": 0.3597564995288849, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 1110 + }, + { + "epoch": 1.0910862152946907, + "grad_norm": 0.3547225296497345, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 1120 + }, + { + "epoch": 1.100828056502679, + "grad_norm": 0.3399786353111267, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1130 + }, + { + "epoch": 1.1105698977106673, + "grad_norm": 0.3309086263179779, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 1140 + }, + { + "epoch": 1.1203117389186557, + "grad_norm": 0.39330706000328064, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1150 + }, + { + "epoch": 1.130053580126644, + "grad_norm": 0.3628021776676178, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1160 + }, + { + "epoch": 1.1397954213346322, + "grad_norm": 0.32995012402534485, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 1170 + }, + { + "epoch": 1.1495372625426206, + "grad_norm": 0.36292821168899536, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1180 + }, + { + "epoch": 1.159279103750609, + "grad_norm": 0.3470092713832855, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 1190 + }, + { + "epoch": 1.169020944958597, + "grad_norm": 0.3496156334877014, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1200 + }, + { + "epoch": 1.1787627861665855, + "grad_norm": 0.3442084789276123, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1210 + }, + { + "epoch": 1.1885046273745739, + "grad_norm": 0.34983909130096436, + "learning_rate": 0.0002, + "loss": 1.7763, + "step": 1220 + }, + { + "epoch": 1.198246468582562, + "grad_norm": 0.36505937576293945, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1230 + }, + { + "epoch": 1.2079883097905504, + "grad_norm": 0.31624770164489746, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1240 + }, + { + "epoch": 1.2177301509985388, + "grad_norm": 0.3528020679950714, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1250 + }, + { + "epoch": 1.2274719922065271, + "grad_norm": 0.29294025897979736, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1260 + }, + { + "epoch": 1.2372138334145153, + "grad_norm": 0.35048434138298035, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1270 + }, + { + "epoch": 1.2469556746225037, + "grad_norm": 0.35224461555480957, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1280 + }, + { + "epoch": 1.256697515830492, + "grad_norm": 0.4041554629802704, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1290 + }, + { + "epoch": 1.2664393570384802, + "grad_norm": 0.3447791039943695, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1300 + }, + { + "epoch": 1.2761811982464686, + "grad_norm": 0.3315333425998688, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1310 + }, + { + "epoch": 1.285923039454457, + "grad_norm": 0.3587741255760193, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 1320 + }, + { + "epoch": 1.2956648806624451, + "grad_norm": 0.3704394996166229, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1330 + }, + { + "epoch": 1.3054067218704335, + "grad_norm": 0.38131803274154663, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1340 + }, + { + "epoch": 1.3151485630784219, + "grad_norm": 0.36109617352485657, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 1350 + }, + { + "epoch": 1.32489040428641, + "grad_norm": 0.37283554673194885, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 1360 + }, + { + "epoch": 1.3346322454943984, + "grad_norm": 0.31808891892433167, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1370 + }, + { + "epoch": 1.3443740867023868, + "grad_norm": 0.3370385766029358, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1380 + }, + { + "epoch": 1.354115927910375, + "grad_norm": 0.3568558394908905, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1390 + }, + { + "epoch": 1.3638577691183633, + "grad_norm": 0.3537410497665405, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1400 + }, + { + "epoch": 1.3735996103263517, + "grad_norm": 0.3536544144153595, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 1410 + }, + { + "epoch": 1.38334145153434, + "grad_norm": 0.3772895038127899, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1420 + }, + { + "epoch": 1.3930832927423282, + "grad_norm": 0.38079720735549927, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1430 + }, + { + "epoch": 1.4028251339503166, + "grad_norm": 0.3811109662055969, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1440 + }, + { + "epoch": 1.412566975158305, + "grad_norm": 0.38586318492889404, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 1450 + }, + { + "epoch": 1.4223088163662934, + "grad_norm": 0.3405744135379791, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1460 + }, + { + "epoch": 1.4320506575742815, + "grad_norm": 0.39527642726898193, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1470 + }, + { + "epoch": 1.44179249878227, + "grad_norm": 0.4494728744029999, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 1480 + }, + { + "epoch": 1.4515343399902583, + "grad_norm": 0.34068453311920166, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1490 + }, + { + "epoch": 1.4612761811982464, + "grad_norm": 0.36169710755348206, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 1500 + }, + { + "epoch": 1.4710180224062348, + "grad_norm": 0.31519418954849243, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 1510 + }, + { + "epoch": 1.4807598636142232, + "grad_norm": 0.35117292404174805, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.4905017048222113, + "grad_norm": 0.40951141715049744, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1530 + }, + { + "epoch": 1.5002435460301997, + "grad_norm": 0.37542906403541565, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 1540 + }, + { + "epoch": 1.509985387238188, + "grad_norm": 0.35395753383636475, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1550 + }, + { + "epoch": 1.5197272284461762, + "grad_norm": 0.35497018694877625, + "learning_rate": 0.0002, + "loss": 1.6517, + "step": 1560 + }, + { + "epoch": 1.5294690696541646, + "grad_norm": 0.3693031072616577, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 1570 + }, + { + "epoch": 1.539210910862153, + "grad_norm": 0.34013301134109497, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1580 + }, + { + "epoch": 1.5489527520701412, + "grad_norm": 0.37312784790992737, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1590 + }, + { + "epoch": 1.5586945932781295, + "grad_norm": 0.357496440410614, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1600 + }, + { + "epoch": 1.568436434486118, + "grad_norm": 0.35192370414733887, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1610 + }, + { + "epoch": 1.578178275694106, + "grad_norm": 0.34144821763038635, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 1620 + }, + { + "epoch": 1.5879201169020944, + "grad_norm": 0.3320509195327759, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 1630 + }, + { + "epoch": 1.5976619581100828, + "grad_norm": 0.34178847074508667, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1640 + }, + { + "epoch": 1.607403799318071, + "grad_norm": 0.36567580699920654, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 1650 + }, + { + "epoch": 1.6171456405260596, + "grad_norm": 0.35599812865257263, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1660 + }, + { + "epoch": 1.6268874817340477, + "grad_norm": 0.33765384554862976, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1670 + }, + { + "epoch": 1.636629322942036, + "grad_norm": 0.33142679929733276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1680 + }, + { + "epoch": 1.6463711641500245, + "grad_norm": 0.6959079504013062, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1690 + }, + { + "epoch": 1.6561130053580126, + "grad_norm": 0.35073819756507874, + "learning_rate": 0.0002, + "loss": 1.7665, + "step": 1700 + }, + { + "epoch": 1.665854846566001, + "grad_norm": 0.3461478352546692, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1710 + }, + { + "epoch": 1.6755966877739894, + "grad_norm": 0.3697752058506012, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 1720 + }, + { + "epoch": 1.6853385289819776, + "grad_norm": 0.3755154609680176, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1730 + }, + { + "epoch": 1.695080370189966, + "grad_norm": 0.33977627754211426, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 1740 + }, + { + "epoch": 1.7048222113979543, + "grad_norm": 0.4001041650772095, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1750 + }, + { + "epoch": 1.7145640526059425, + "grad_norm": 0.36998286843299866, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 1760 + }, + { + "epoch": 1.7243058938139308, + "grad_norm": 0.39944565296173096, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1770 + }, + { + "epoch": 1.7340477350219192, + "grad_norm": 0.4002859890460968, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1780 + }, + { + "epoch": 1.7437895762299074, + "grad_norm": 0.33336859941482544, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 1790 + }, + { + "epoch": 1.7535314174378958, + "grad_norm": 0.35853952169418335, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 1800 + }, + { + "epoch": 1.7632732586458841, + "grad_norm": 0.35876700282096863, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1810 + }, + { + "epoch": 1.7730150998538723, + "grad_norm": 0.3497968912124634, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 1820 + }, + { + "epoch": 1.7827569410618607, + "grad_norm": 0.33182016015052795, + "learning_rate": 0.0002, + "loss": 1.7128, + "step": 1830 + }, + { + "epoch": 1.792498782269849, + "grad_norm": 0.33359771966934204, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1840 + }, + { + "epoch": 1.8022406234778372, + "grad_norm": 0.38070961833000183, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 1850 + }, + { + "epoch": 1.8119824646858256, + "grad_norm": 0.34111160039901733, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 1860 + }, + { + "epoch": 1.821724305893814, + "grad_norm": 0.4439302980899811, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1870 + }, + { + "epoch": 1.8314661471018021, + "grad_norm": 0.37065210938453674, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 1880 + }, + { + "epoch": 1.8412079883097907, + "grad_norm": 0.33630406856536865, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 1890 + }, + { + "epoch": 1.8509498295177789, + "grad_norm": 0.334553986787796, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 1900 + }, + { + "epoch": 1.860691670725767, + "grad_norm": 0.3603808879852295, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1910 + }, + { + "epoch": 1.8704335119337556, + "grad_norm": 0.4307343363761902, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1920 + }, + { + "epoch": 1.8801753531417438, + "grad_norm": 0.455602765083313, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1930 + }, + { + "epoch": 1.8899171943497322, + "grad_norm": 0.35242316126823425, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 1940 + }, + { + "epoch": 1.8996590355577205, + "grad_norm": 0.3589116632938385, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1950 + }, + { + "epoch": 1.9094008767657087, + "grad_norm": 0.3540741801261902, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1960 + }, + { + "epoch": 1.919142717973697, + "grad_norm": 0.3547612428665161, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 1970 + }, + { + "epoch": 1.9288845591816854, + "grad_norm": 0.3485773503780365, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1980 + }, + { + "epoch": 1.9386264003896736, + "grad_norm": 0.3560304641723633, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1990 + }, + { + "epoch": 1.948368241597662, + "grad_norm": 0.33299335837364197, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 2000 + }, + { + "epoch": 1.9581100828056504, + "grad_norm": 0.35622233152389526, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2010 + }, + { + "epoch": 1.9678519240136385, + "grad_norm": 0.3681301474571228, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 2020 + }, + { + "epoch": 1.9775937652216269, + "grad_norm": 0.36158084869384766, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2030 + }, + { + "epoch": 1.9873356064296153, + "grad_norm": 0.32560569047927856, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 2040 + }, + { + "epoch": 1.9970774476376034, + "grad_norm": 0.37404149770736694, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 2050 + }, + { + "epoch": 2.0, + "eval_loss": 1.8119343519210815, + "eval_runtime": 96.0045, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 0.667, + "step": 2053 + }, + { + "epoch": 2.006819288845592, + "grad_norm": 0.374188631772995, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 2060 + }, + { + "epoch": 2.01656113005358, + "grad_norm": 0.421764075756073, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2070 + }, + { + "epoch": 2.0263029712615683, + "grad_norm": 0.43841829895973206, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2080 + }, + { + "epoch": 2.036044812469557, + "grad_norm": 0.42298218607902527, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 2090 + }, + { + "epoch": 2.045786653677545, + "grad_norm": 0.43669602274894714, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 2100 + }, + { + "epoch": 2.0555284948855332, + "grad_norm": 0.4080469012260437, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 2110 + }, + { + "epoch": 2.065270336093522, + "grad_norm": 0.483192503452301, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2120 + }, + { + "epoch": 2.07501217730151, + "grad_norm": 0.44427400827407837, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 2130 + }, + { + "epoch": 2.084754018509498, + "grad_norm": 0.48835131525993347, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 2140 + }, + { + "epoch": 2.0944958597174868, + "grad_norm": 0.42733684182167053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2150 + }, + { + "epoch": 2.104237700925475, + "grad_norm": 0.4258694648742676, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 2160 + }, + { + "epoch": 2.113979542133463, + "grad_norm": 0.5164985656738281, + "learning_rate": 0.0002, + "loss": 1.517, + "step": 2170 + }, + { + "epoch": 2.1237213833414517, + "grad_norm": 0.4279228150844574, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 2180 + }, + { + "epoch": 2.13346322454944, + "grad_norm": 0.48209506273269653, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2190 + }, + { + "epoch": 2.143205065757428, + "grad_norm": 0.4071785509586334, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 2200 + }, + { + "epoch": 2.1529469069654166, + "grad_norm": 0.4629398584365845, + "learning_rate": 0.0002, + "loss": 1.5469, + "step": 2210 + }, + { + "epoch": 2.1626887481734047, + "grad_norm": 0.44390997290611267, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2220 + }, + { + "epoch": 2.172430589381393, + "grad_norm": 0.46886971592903137, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 2230 + }, + { + "epoch": 2.1821724305893815, + "grad_norm": 0.43745434284210205, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 2240 + }, + { + "epoch": 2.1919142717973696, + "grad_norm": 0.42737245559692383, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 2250 + }, + { + "epoch": 2.201656113005358, + "grad_norm": 0.5028428435325623, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 2260 + }, + { + "epoch": 2.2113979542133464, + "grad_norm": 0.48987212777137756, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 2270 + }, + { + "epoch": 2.2211397954213346, + "grad_norm": 0.48186370730400085, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 2280 + }, + { + "epoch": 2.2308816366293227, + "grad_norm": 0.4417429566383362, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 2290 + }, + { + "epoch": 2.2406234778373113, + "grad_norm": 0.4757710099220276, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2300 + }, + { + "epoch": 2.2503653190452995, + "grad_norm": 0.44449448585510254, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2310 + }, + { + "epoch": 2.260107160253288, + "grad_norm": 0.5070863962173462, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 2320 + }, + { + "epoch": 2.269849001461276, + "grad_norm": 0.4967133700847626, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2330 + }, + { + "epoch": 2.2795908426692644, + "grad_norm": 0.5110220909118652, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2340 + }, + { + "epoch": 2.289332683877253, + "grad_norm": 0.47984135150909424, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2350 + }, + { + "epoch": 2.299074525085241, + "grad_norm": 0.5005794763565063, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 2360 + }, + { + "epoch": 2.3088163662932293, + "grad_norm": 0.4991425573825836, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 2370 + }, + { + "epoch": 2.318558207501218, + "grad_norm": 0.4948616623878479, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2380 + }, + { + "epoch": 2.328300048709206, + "grad_norm": 0.4533160328865051, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2390 + }, + { + "epoch": 2.338041889917194, + "grad_norm": 0.5871071219444275, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 2400 + }, + { + "epoch": 2.347783731125183, + "grad_norm": 0.5048075914382935, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2410 + }, + { + "epoch": 2.357525572333171, + "grad_norm": 0.4973750412464142, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 2420 + }, + { + "epoch": 2.367267413541159, + "grad_norm": 0.48294538259506226, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 2430 + }, + { + "epoch": 2.3770092547491477, + "grad_norm": 0.7180454134941101, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 2440 + }, + { + "epoch": 2.386751095957136, + "grad_norm": 0.4627632796764374, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2450 + }, + { + "epoch": 2.396492937165124, + "grad_norm": 0.4834378957748413, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 2460 + }, + { + "epoch": 2.4062347783731126, + "grad_norm": 0.5173670649528503, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 2470 + }, + { + "epoch": 2.4159766195811008, + "grad_norm": 0.49652737379074097, + "learning_rate": 0.0002, + "loss": 1.5464, + "step": 2480 + }, + { + "epoch": 2.4257184607890894, + "grad_norm": 0.47052669525146484, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 2490 + }, + { + "epoch": 2.4354603019970775, + "grad_norm": 0.5188006162643433, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 2500 + }, + { + "epoch": 2.4452021432050657, + "grad_norm": 0.5010119676589966, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 2510 + }, + { + "epoch": 2.4549439844130543, + "grad_norm": 0.4765235483646393, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2520 + }, + { + "epoch": 2.4646858256210424, + "grad_norm": 0.5292699337005615, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2530 + }, + { + "epoch": 2.4744276668290306, + "grad_norm": 0.48555099964141846, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 2540 + }, + { + "epoch": 2.484169508037019, + "grad_norm": 0.4764043092727661, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 2550 + }, + { + "epoch": 2.4939113492450073, + "grad_norm": 0.47839659452438354, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 2560 + }, + { + "epoch": 2.5036531904529955, + "grad_norm": 0.4514436721801758, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 2570 + }, + { + "epoch": 2.513395031660984, + "grad_norm": 0.5681955218315125, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2580 + }, + { + "epoch": 2.5231368728689723, + "grad_norm": 0.49655985832214355, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 2590 + }, + { + "epoch": 2.5328787140769604, + "grad_norm": 0.5077657103538513, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2600 + }, + { + "epoch": 2.542620555284949, + "grad_norm": 0.5643279552459717, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 2610 + }, + { + "epoch": 2.552362396492937, + "grad_norm": 0.4715031087398529, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 2620 + }, + { + "epoch": 2.5621042377009253, + "grad_norm": 0.528400719165802, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2630 + }, + { + "epoch": 2.571846078908914, + "grad_norm": 0.49469611048698425, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2640 + }, + { + "epoch": 2.581587920116902, + "grad_norm": 0.4567806124687195, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 2650 + }, + { + "epoch": 2.5913297613248902, + "grad_norm": 0.5357107520103455, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2660 + }, + { + "epoch": 2.601071602532879, + "grad_norm": 0.46977677941322327, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 2670 + }, + { + "epoch": 2.610813443740867, + "grad_norm": 0.6626771092414856, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 2680 + }, + { + "epoch": 2.620555284948855, + "grad_norm": 0.4587472081184387, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2690 + }, + { + "epoch": 2.6302971261568437, + "grad_norm": 0.4816797077655792, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2700 + }, + { + "epoch": 2.640038967364832, + "grad_norm": 0.4856809675693512, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.64978080857282, + "grad_norm": 0.46010780334472656, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2720 + }, + { + "epoch": 2.6595226497808087, + "grad_norm": 0.4637954533100128, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 2730 + }, + { + "epoch": 2.669264490988797, + "grad_norm": 0.5954997539520264, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 2740 + }, + { + "epoch": 2.679006332196785, + "grad_norm": 0.5071861743927002, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2750 + }, + { + "epoch": 2.6887481734047736, + "grad_norm": 0.5415477156639099, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.6984900146127617, + "grad_norm": 0.5618549585342407, + "learning_rate": 0.0002, + "loss": 1.5476, + "step": 2770 + }, + { + "epoch": 2.70823185582075, + "grad_norm": 0.49338817596435547, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 2780 + }, + { + "epoch": 2.7179736970287385, + "grad_norm": 0.5149586796760559, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2790 + }, + { + "epoch": 2.7277155382367266, + "grad_norm": 0.6247242093086243, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2800 + }, + { + "epoch": 2.737457379444715, + "grad_norm": 0.4749542474746704, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2810 + }, + { + "epoch": 2.7471992206527034, + "grad_norm": 0.4979191720485687, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 2820 + }, + { + "epoch": 2.7569410618606915, + "grad_norm": 0.4885074198246002, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2830 + }, + { + "epoch": 2.76668290306868, + "grad_norm": 0.5047747492790222, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2840 + }, + { + "epoch": 2.7764247442766683, + "grad_norm": 0.5280140042304993, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2850 + }, + { + "epoch": 2.7861665854846565, + "grad_norm": 0.477668434381485, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 2860 + }, + { + "epoch": 2.795908426692645, + "grad_norm": 0.4816327393054962, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2870 + }, + { + "epoch": 2.805650267900633, + "grad_norm": 0.523259162902832, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2880 + }, + { + "epoch": 2.8153921091086214, + "grad_norm": 0.5045270919799805, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2890 + }, + { + "epoch": 2.82513395031661, + "grad_norm": 0.47986042499542236, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2900 + }, + { + "epoch": 2.834875791524598, + "grad_norm": 0.4858797490596771, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2910 + }, + { + "epoch": 2.8446176327325867, + "grad_norm": 0.5261512398719788, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 2920 + }, + { + "epoch": 2.854359473940575, + "grad_norm": 0.630550742149353, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2930 + }, + { + "epoch": 2.864101315148563, + "grad_norm": 0.49119752645492554, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2940 + }, + { + "epoch": 2.8738431563565516, + "grad_norm": 0.4779070317745209, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2950 + }, + { + "epoch": 2.88358499756454, + "grad_norm": 0.5059782266616821, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 2960 + }, + { + "epoch": 2.893326838772528, + "grad_norm": 0.5466655492782593, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2970 + }, + { + "epoch": 2.9030686799805165, + "grad_norm": 0.4865640103816986, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 2980 + }, + { + "epoch": 2.9128105211885047, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 2990 + }, + { + "epoch": 2.922552362396493, + "grad_norm": 0.9112305641174316, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 3000 + }, + { + "epoch": 2.9322942036044815, + "grad_norm": 0.3938814103603363, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 3010 + }, + { + "epoch": 2.9420360448124696, + "grad_norm": 0.5500800609588623, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 3020 + }, + { + "epoch": 2.9517778860204578, + "grad_norm": 0.5346390604972839, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3030 + }, + { + "epoch": 2.9615197272284464, + "grad_norm": 0.5245014429092407, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3040 + }, + { + "epoch": 2.9712615684364345, + "grad_norm": 0.4906884431838989, + "learning_rate": 0.0002, + "loss": 1.5549, + "step": 3050 + }, + { + "epoch": 2.9810034096444227, + "grad_norm": 0.47086769342422485, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3060 + }, + { + "epoch": 2.9907452508524113, + "grad_norm": 0.5290229320526123, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3070 + }, + { + "epoch": 2.9995129079396006, + "eval_loss": 1.8463934659957886, + "eval_runtime": 56.2401, + "eval_samples_per_second": 9.015, + "eval_steps_per_second": 1.138, + "step": 3079 + }, + { + "epoch": 3.0004870920603994, + "grad_norm": 0.49992576241493225, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 3080 + }, + { + "epoch": 3.0102289332683876, + "grad_norm": 0.8242783546447754, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3090 + }, + { + "epoch": 3.019970774476376, + "grad_norm": 0.6330569386482239, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3100 + }, + { + "epoch": 3.0297126156843643, + "grad_norm": 0.566097617149353, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 3110 + }, + { + "epoch": 3.0394544568923525, + "grad_norm": 0.6337586045265198, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 3120 + }, + { + "epoch": 3.049196298100341, + "grad_norm": 0.7339403033256531, + "learning_rate": 0.0002, + "loss": 1.3916, + "step": 3130 + }, + { + "epoch": 3.0589381393083293, + "grad_norm": 0.7187346816062927, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 3140 + }, + { + "epoch": 3.0686799805163174, + "grad_norm": 0.7116255760192871, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3150 + }, + { + "epoch": 3.078421821724306, + "grad_norm": 0.6493807435035706, + "learning_rate": 0.0002, + "loss": 1.4452, + "step": 3160 + }, + { + "epoch": 3.088163662932294, + "grad_norm": 0.6777266263961792, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 3170 + }, + { + "epoch": 3.0979055041402823, + "grad_norm": 0.6342006325721741, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 3180 + }, + { + "epoch": 3.107647345348271, + "grad_norm": 0.6608964204788208, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 3190 + }, + { + "epoch": 3.117389186556259, + "grad_norm": 0.7230247259140015, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 3200 + }, + { + "epoch": 3.1271310277642472, + "grad_norm": 0.650368332862854, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 3210 + }, + { + "epoch": 3.136872868972236, + "grad_norm": 0.7319342494010925, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 3220 + }, + { + "epoch": 3.146614710180224, + "grad_norm": 0.7159963846206665, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 3230 + }, + { + "epoch": 3.156356551388212, + "grad_norm": 0.8905230164527893, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 3240 + }, + { + "epoch": 3.1660983925962007, + "grad_norm": 0.6920804381370544, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 3250 + }, + { + "epoch": 3.175840233804189, + "grad_norm": 0.6782063841819763, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 3260 + }, + { + "epoch": 3.1855820750121775, + "grad_norm": 0.735325276851654, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3270 + }, + { + "epoch": 3.1953239162201656, + "grad_norm": 0.6657978296279907, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 3280 + }, + { + "epoch": 3.205065757428154, + "grad_norm": 0.771315336227417, + "learning_rate": 0.0002, + "loss": 1.3456, + "step": 3290 + }, + { + "epoch": 3.2148075986361424, + "grad_norm": 0.6492983102798462, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 3300 + }, + { + "epoch": 3.2245494398441306, + "grad_norm": 0.7513770461082458, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3310 + }, + { + "epoch": 3.2342912810521187, + "grad_norm": 0.7091423869132996, + "learning_rate": 0.0002, + "loss": 1.4032, + "step": 3320 + }, + { + "epoch": 3.2440331222601073, + "grad_norm": 0.6663975119590759, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 3330 + }, + { + "epoch": 3.2537749634680955, + "grad_norm": 0.6813122034072876, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 3340 + }, + { + "epoch": 3.2635168046760836, + "grad_norm": 0.6602569818496704, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 3350 + }, + { + "epoch": 3.2732586458840722, + "grad_norm": 0.718270480632782, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 3360 + }, + { + "epoch": 3.2830004870920604, + "grad_norm": 0.6884173154830933, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 3370 + }, + { + "epoch": 3.2927423283000485, + "grad_norm": 0.7039775848388672, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3380 + }, + { + "epoch": 3.302484169508037, + "grad_norm": 0.7444299459457397, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3390 + }, + { + "epoch": 3.3122260107160253, + "grad_norm": 0.7187064290046692, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 3400 + }, + { + "epoch": 3.3219678519240134, + "grad_norm": 0.599396288394928, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3410 + }, + { + "epoch": 3.331709693132002, + "grad_norm": 0.7670390009880066, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3420 + }, + { + "epoch": 3.34145153433999, + "grad_norm": 0.6654478311538696, + "learning_rate": 0.0002, + "loss": 1.4411, + "step": 3430 + }, + { + "epoch": 3.351193375547979, + "grad_norm": 0.6644385457038879, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 3440 + }, + { + "epoch": 3.360935216755967, + "grad_norm": 0.6974098086357117, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3450 + }, + { + "epoch": 3.370677057963955, + "grad_norm": 0.7350399494171143, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 3460 + }, + { + "epoch": 3.3804188991719437, + "grad_norm": 0.714721143245697, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 3470 + }, + { + "epoch": 3.390160740379932, + "grad_norm": 0.7006027698516846, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 3480 + }, + { + "epoch": 3.39990258158792, + "grad_norm": 0.6767925024032593, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3490 + }, + { + "epoch": 3.4096444227959086, + "grad_norm": 0.6721355319023132, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 3500 + }, + { + "epoch": 3.419386264003897, + "grad_norm": 0.6845725178718567, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3510 + }, + { + "epoch": 3.429128105211885, + "grad_norm": 0.6882196664810181, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 3520 + }, + { + "epoch": 3.4388699464198735, + "grad_norm": 0.7663240432739258, + "learning_rate": 0.0002, + "loss": 1.4962, + "step": 3530 + }, + { + "epoch": 3.4486117876278617, + "grad_norm": 0.6304219365119934, + "learning_rate": 0.0002, + "loss": 1.4644, + "step": 3540 + }, + { + "epoch": 3.45835362883585, + "grad_norm": 0.668678879737854, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3550 + }, + { + "epoch": 3.4680954700438384, + "grad_norm": 0.7526912093162537, + "learning_rate": 0.0002, + "loss": 1.4874, + "step": 3560 + }, + { + "epoch": 3.4778373112518266, + "grad_norm": 1.089495301246643, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 3570 + }, + { + "epoch": 3.4875791524598148, + "grad_norm": 0.7282902002334595, + "learning_rate": 0.0002, + "loss": 1.3871, + "step": 3580 + }, + { + "epoch": 3.4973209936678034, + "grad_norm": 0.6540156602859497, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3590 + }, + { + "epoch": 3.5070628348757915, + "grad_norm": 0.6449568867683411, + "learning_rate": 0.0002, + "loss": 1.4367, + "step": 3600 + }, + { + "epoch": 3.5168046760837797, + "grad_norm": 0.7262216210365295, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 3610 + }, + { + "epoch": 3.5265465172917683, + "grad_norm": 0.6048615574836731, + "learning_rate": 0.0002, + "loss": 1.4374, + "step": 3620 + }, + { + "epoch": 3.5362883584997564, + "grad_norm": 0.6780537366867065, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 3630 + }, + { + "epoch": 3.5460301997077446, + "grad_norm": 0.6851925253868103, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 3640 + }, + { + "epoch": 3.555772040915733, + "grad_norm": 0.6530634164810181, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3650 + }, + { + "epoch": 3.5655138821237213, + "grad_norm": 0.7193992733955383, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3660 + }, + { + "epoch": 3.5752557233317095, + "grad_norm": 0.767496645450592, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 3670 + }, + { + "epoch": 3.584997564539698, + "grad_norm": 0.6912919282913208, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 3680 + }, + { + "epoch": 3.5947394057476862, + "grad_norm": 0.7383436560630798, + "learning_rate": 0.0002, + "loss": 1.4497, + "step": 3690 + }, + { + "epoch": 3.6044812469556744, + "grad_norm": 0.6746662855148315, + "learning_rate": 0.0002, + "loss": 1.4822, + "step": 3700 + }, + { + "epoch": 3.614223088163663, + "grad_norm": 0.6885138750076294, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3710 + }, + { + "epoch": 3.623964929371651, + "grad_norm": 0.6694392561912537, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3720 + }, + { + "epoch": 3.6337067705796393, + "grad_norm": 0.812358021736145, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 3730 + }, + { + "epoch": 3.643448611787628, + "grad_norm": 0.7267130017280579, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 3740 + }, + { + "epoch": 3.653190452995616, + "grad_norm": 0.6958749294281006, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3750 + }, + { + "epoch": 3.6629322942036042, + "grad_norm": 0.6805673241615295, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3760 + }, + { + "epoch": 3.672674135411593, + "grad_norm": 0.7184410095214844, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3770 + }, + { + "epoch": 3.682415976619581, + "grad_norm": 0.7716330289840698, + "learning_rate": 0.0002, + "loss": 1.3935, + "step": 3780 + }, + { + "epoch": 3.6921578178275696, + "grad_norm": 0.6675831079483032, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3790 + }, + { + "epoch": 3.7018996590355577, + "grad_norm": 0.6480095386505127, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 3800 + }, + { + "epoch": 3.711641500243546, + "grad_norm": 0.6559418439865112, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 3810 + }, + { + "epoch": 3.7213833414515345, + "grad_norm": 0.6596545577049255, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 3820 + }, + { + "epoch": 3.7311251826595226, + "grad_norm": 0.7172950506210327, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3830 + }, + { + "epoch": 3.740867023867511, + "grad_norm": 0.796148419380188, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 3840 + }, + { + "epoch": 3.7506088650754994, + "grad_norm": 0.6600322723388672, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 3850 + }, + { + "epoch": 3.7603507062834876, + "grad_norm": 0.6776387691497803, + "learning_rate": 0.0002, + "loss": 1.4201, + "step": 3860 + }, + { + "epoch": 3.770092547491476, + "grad_norm": 0.7768304347991943, + "learning_rate": 0.0002, + "loss": 1.3893, + "step": 3870 + }, + { + "epoch": 3.7798343886994643, + "grad_norm": 1.0579794645309448, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 3880 + }, + { + "epoch": 3.7895762299074525, + "grad_norm": 0.6757252812385559, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 3890 + }, + { + "epoch": 3.799318071115441, + "grad_norm": 0.6706996560096741, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3900 + }, + { + "epoch": 3.809059912323429, + "grad_norm": 0.7026948928833008, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 3910 + }, + { + "epoch": 3.8188017535314174, + "grad_norm": 0.6437768340110779, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 3920 + }, + { + "epoch": 3.828543594739406, + "grad_norm": 0.7015706300735474, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 3930 + }, + { + "epoch": 3.838285435947394, + "grad_norm": 0.7049482464790344, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 3940 + }, + { + "epoch": 3.8480272771553823, + "grad_norm": 0.6533724665641785, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3950 + }, + { + "epoch": 3.857769118363371, + "grad_norm": 0.7312499284744263, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 3960 + }, + { + "epoch": 3.867510959571359, + "grad_norm": 0.6858801245689392, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3970 + }, + { + "epoch": 3.877252800779347, + "grad_norm": 0.770423173904419, + "learning_rate": 0.0002, + "loss": 1.4423, + "step": 3980 + }, + { + "epoch": 3.886994641987336, + "grad_norm": 0.6987539529800415, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 3990 + }, + { + "epoch": 3.896736483195324, + "grad_norm": 0.7072722315788269, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 4000 + }, + { + "epoch": 3.906478324403312, + "grad_norm": 0.6492931842803955, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4010 + }, + { + "epoch": 3.9162201656113007, + "grad_norm": 0.7716232538223267, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 4020 + }, + { + "epoch": 3.925962006819289, + "grad_norm": 0.722949743270874, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 4030 + }, + { + "epoch": 3.935703848027277, + "grad_norm": 0.7434365749359131, + "learning_rate": 0.0002, + "loss": 1.3914, + "step": 4040 + }, + { + "epoch": 3.9454456892352656, + "grad_norm": 0.6691509485244751, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 4050 + }, + { + "epoch": 3.9551875304432538, + "grad_norm": 0.6850284337997437, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4060 + }, + { + "epoch": 3.964929371651242, + "grad_norm": 0.6954452991485596, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4070 + }, + { + "epoch": 3.9746712128592305, + "grad_norm": 0.9316364526748657, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 4080 + }, + { + "epoch": 3.9844130540672187, + "grad_norm": 0.6908289194107056, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 4090 + }, + { + "epoch": 3.994154895275207, + "grad_norm": 0.666782021522522, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 4100 + }, + { + "epoch": 4.0, + "eval_loss": 1.9233275651931763, + "eval_runtime": 55.9536, + "eval_samples_per_second": 9.061, + "eval_steps_per_second": 1.144, + "step": 4106 + } + ], + "logging_steps": 10, + "max_steps": 8208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1089953954922496e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-4106/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b487f7049f5f340d2abdc9a44162faf6a13b70b9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9468b2f6834f3c73875087280b883ab24849e7b2b49ef8c169afbe03f80f5bcc +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5690f8f751d22e5100cb193b6be5590f574b458c --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82d4f36b5067df6d0c71805aba92215fb1c159bbcde1d015e7bc6da17a4128e +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..269e4a936375ac7abfdf34c6924ac370d03a5531 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ce99a0a69cc8a4b3b9d68161778aab5768b17b0e4746fc85ca7b8d630056f4 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0102d21fdb3705adf3dfdbf03459433cceb205d6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450ddc08f4bb85710f00f5916b9609332f1a6e377e9adbadf971157424df885f +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3cb8b1599d11a61b9e38eb84d83cf9f571ad064f --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/trainer_state.json @@ -0,0 +1,3664 @@ +{ + "best_metric": 1.8103164434432983, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", + "epoch": 4.9995129079396, + "eval_steps": 10, + "global_step": 5132, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00974184120798831, + "grad_norm": 0.6537588834762573, + "learning_rate": 0.0002, + "loss": 3.0782, + "step": 10 + }, + { + "epoch": 0.01948368241597662, + "grad_norm": 0.5270306468009949, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 20 + }, + { + "epoch": 0.029225523623964928, + "grad_norm": 0.6826501488685608, + "learning_rate": 0.0002, + "loss": 2.2589, + "step": 30 + }, + { + "epoch": 0.03896736483195324, + "grad_norm": 0.5061377286911011, + "learning_rate": 0.0002, + "loss": 2.0141, + "step": 40 + }, + { + "epoch": 0.04870920603994155, + "grad_norm": 0.4300410747528076, + "learning_rate": 0.0002, + "loss": 1.9458, + "step": 50 + }, + { + "epoch": 0.058451047247929856, + "grad_norm": 0.5063319802284241, + "learning_rate": 0.0002, + "loss": 1.983, + "step": 60 + }, + { + "epoch": 0.06819288845591817, + "grad_norm": 0.49310117959976196, + "learning_rate": 0.0002, + "loss": 1.9799, + "step": 70 + }, + { + "epoch": 0.07793472966390648, + "grad_norm": 0.4676004648208618, + "learning_rate": 0.0002, + "loss": 1.9277, + "step": 80 + }, + { + "epoch": 0.08767657087189479, + "grad_norm": 0.41647228598594666, + "learning_rate": 0.0002, + "loss": 1.9147, + "step": 90 + }, + { + "epoch": 0.0974184120798831, + "grad_norm": 0.40217313170433044, + "learning_rate": 0.0002, + "loss": 1.8894, + "step": 100 + }, + { + "epoch": 0.1071602532878714, + "grad_norm": 0.4123637080192566, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 110 + }, + { + "epoch": 0.11690209449585971, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 120 + }, + { + "epoch": 0.12664393570384802, + "grad_norm": 0.3179326355457306, + "learning_rate": 0.0002, + "loss": 1.894, + "step": 130 + }, + { + "epoch": 0.13638577691183634, + "grad_norm": 0.3548192083835602, + "learning_rate": 0.0002, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.14612761811982464, + "grad_norm": 0.3273540139198303, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 150 + }, + { + "epoch": 0.15586945932781296, + "grad_norm": 0.36500975489616394, + "learning_rate": 0.0002, + "loss": 1.8496, + "step": 160 + }, + { + "epoch": 0.16561130053580125, + "grad_norm": 0.4106619656085968, + "learning_rate": 0.0002, + "loss": 1.8473, + "step": 170 + }, + { + "epoch": 0.17535314174378958, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0002, + "loss": 1.841, + "step": 180 + }, + { + "epoch": 0.1850949829517779, + "grad_norm": 0.3608580231666565, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 190 + }, + { + "epoch": 0.1948368241597662, + "grad_norm": 0.4291760325431824, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 200 + }, + { + "epoch": 0.20457866536775451, + "grad_norm": 0.344184011220932, + "learning_rate": 0.0002, + "loss": 1.8437, + "step": 210 + }, + { + "epoch": 0.2143205065757428, + "grad_norm": 0.3834705650806427, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 220 + }, + { + "epoch": 0.22406234778373113, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 230 + }, + { + "epoch": 0.23380418899171942, + "grad_norm": 0.4306780695915222, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 240 + }, + { + "epoch": 0.24354603019970775, + "grad_norm": 0.5066465139389038, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 250 + }, + { + "epoch": 0.25328787140769604, + "grad_norm": 0.34227681159973145, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 260 + }, + { + "epoch": 0.26302971261568436, + "grad_norm": 0.3346865475177765, + "learning_rate": 0.0002, + "loss": 1.8614, + "step": 270 + }, + { + "epoch": 0.2727715538236727, + "grad_norm": 0.3639362156391144, + "learning_rate": 0.0002, + "loss": 1.8502, + "step": 280 + }, + { + "epoch": 0.282513395031661, + "grad_norm": 0.33223700523376465, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 290 + }, + { + "epoch": 0.2922552362396493, + "grad_norm": 0.35176315903663635, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 300 + }, + { + "epoch": 0.3019970774476376, + "grad_norm": 0.3581472635269165, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 310 + }, + { + "epoch": 0.3117389186556259, + "grad_norm": 0.35943421721458435, + "learning_rate": 0.0002, + "loss": 1.8262, + "step": 320 + }, + { + "epoch": 0.32148075986361424, + "grad_norm": 0.322051078081131, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 330 + }, + { + "epoch": 0.3312226010716025, + "grad_norm": 0.33904823660850525, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 340 + }, + { + "epoch": 0.34096444227959083, + "grad_norm": 0.39162731170654297, + "learning_rate": 0.0002, + "loss": 1.9159, + "step": 350 + }, + { + "epoch": 0.35070628348757915, + "grad_norm": 0.330624520778656, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 360 + }, + { + "epoch": 0.3604481246955675, + "grad_norm": 0.3793248236179352, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 370 + }, + { + "epoch": 0.3701899659035558, + "grad_norm": 0.3347395658493042, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 380 + }, + { + "epoch": 0.37993180711154406, + "grad_norm": 0.30527254939079285, + "learning_rate": 0.0002, + "loss": 1.9244, + "step": 390 + }, + { + "epoch": 0.3896736483195324, + "grad_norm": 0.3081390857696533, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 400 + }, + { + "epoch": 0.3994154895275207, + "grad_norm": 0.3742620050907135, + "learning_rate": 0.0002, + "loss": 1.8968, + "step": 410 + }, + { + "epoch": 0.40915733073550903, + "grad_norm": 0.4080568253993988, + "learning_rate": 0.0002, + "loss": 1.8095, + "step": 420 + }, + { + "epoch": 0.4188991719434973, + "grad_norm": 0.38034746050834656, + "learning_rate": 0.0002, + "loss": 1.8555, + "step": 430 + }, + { + "epoch": 0.4286410131514856, + "grad_norm": 0.34893402457237244, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 440 + }, + { + "epoch": 0.43838285435947394, + "grad_norm": 0.33285608887672424, + "learning_rate": 0.0002, + "loss": 1.8481, + "step": 450 + }, + { + "epoch": 0.44812469556746226, + "grad_norm": 0.4110095798969269, + "learning_rate": 0.0002, + "loss": 1.8466, + "step": 460 + }, + { + "epoch": 0.4578665367754506, + "grad_norm": 0.3658817410469055, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 470 + }, + { + "epoch": 0.46760837798343885, + "grad_norm": 0.31350770592689514, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 480 + }, + { + "epoch": 0.47735021919142717, + "grad_norm": 0.38827991485595703, + "learning_rate": 0.0002, + "loss": 1.7839, + "step": 490 + }, + { + "epoch": 0.4870920603994155, + "grad_norm": 0.3792393207550049, + "learning_rate": 0.0002, + "loss": 1.8224, + "step": 500 + }, + { + "epoch": 0.4968339016074038, + "grad_norm": 0.3004095256328583, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 510 + }, + { + "epoch": 0.5065757428153921, + "grad_norm": 0.3200063407421112, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 520 + }, + { + "epoch": 0.5163175840233805, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 530 + }, + { + "epoch": 0.5260594252313687, + "grad_norm": 0.30258631706237793, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 540 + }, + { + "epoch": 0.535801266439357, + "grad_norm": 0.28210392594337463, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 550 + }, + { + "epoch": 0.5455431076473454, + "grad_norm": 0.34854066371917725, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 560 + }, + { + "epoch": 0.5552849488553336, + "grad_norm": 0.31689873337745667, + "learning_rate": 0.0002, + "loss": 1.8331, + "step": 570 + }, + { + "epoch": 0.565026790063322, + "grad_norm": 0.31253790855407715, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 580 + }, + { + "epoch": 0.5747686312713103, + "grad_norm": 0.3229721188545227, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 590 + }, + { + "epoch": 0.5845104724792985, + "grad_norm": 0.3723772466182709, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 600 + }, + { + "epoch": 0.5942523136872869, + "grad_norm": 0.345798522233963, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 610 + }, + { + "epoch": 0.6039941548952752, + "grad_norm": 0.3440598249435425, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 620 + }, + { + "epoch": 0.6137359961032636, + "grad_norm": 0.3406416177749634, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 630 + }, + { + "epoch": 0.6234778373112518, + "grad_norm": 0.3218357264995575, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 640 + }, + { + "epoch": 0.6332196785192401, + "grad_norm": 0.45319172739982605, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 650 + }, + { + "epoch": 0.6429615197272285, + "grad_norm": 0.2787110507488251, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 660 + }, + { + "epoch": 0.6527033609352167, + "grad_norm": 0.3064707815647125, + "learning_rate": 0.0002, + "loss": 1.8426, + "step": 670 + }, + { + "epoch": 0.662445202143205, + "grad_norm": 0.2940629720687866, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 680 + }, + { + "epoch": 0.6721870433511934, + "grad_norm": 0.31695225834846497, + "learning_rate": 0.0002, + "loss": 1.7865, + "step": 690 + }, + { + "epoch": 0.6819288845591817, + "grad_norm": 0.29589611291885376, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 700 + }, + { + "epoch": 0.69167072576717, + "grad_norm": 0.3062121570110321, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 710 + }, + { + "epoch": 0.7014125669751583, + "grad_norm": 0.3315656781196594, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 720 + }, + { + "epoch": 0.7111544081831466, + "grad_norm": 0.30353930592536926, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 730 + }, + { + "epoch": 0.720896249391135, + "grad_norm": 0.28360483050346375, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 0.7306380905991232, + "grad_norm": 0.3362562656402588, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 750 + }, + { + "epoch": 0.7403799318071116, + "grad_norm": 0.40434667468070984, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 760 + }, + { + "epoch": 0.7501217730150999, + "grad_norm": 0.2930425703525543, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 770 + }, + { + "epoch": 0.7598636142230881, + "grad_norm": 0.30177003145217896, + "learning_rate": 0.0002, + "loss": 1.8216, + "step": 780 + }, + { + "epoch": 0.7696054554310765, + "grad_norm": 0.2784474790096283, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 790 + }, + { + "epoch": 0.7793472966390648, + "grad_norm": 0.35849854350090027, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 800 + }, + { + "epoch": 0.7890891378470531, + "grad_norm": 0.27329114079475403, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 810 + }, + { + "epoch": 0.7988309790550414, + "grad_norm": 0.33331671357154846, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 820 + }, + { + "epoch": 0.8085728202630297, + "grad_norm": 0.28727295994758606, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 830 + }, + { + "epoch": 0.8183146614710181, + "grad_norm": 0.31391268968582153, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 840 + }, + { + "epoch": 0.8280565026790063, + "grad_norm": 0.3303709030151367, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 850 + }, + { + "epoch": 0.8377983438869946, + "grad_norm": 0.33772537112236023, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 860 + }, + { + "epoch": 0.847540185094983, + "grad_norm": 0.32876333594322205, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 870 + }, + { + "epoch": 0.8572820263029712, + "grad_norm": 0.28444716334342957, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 880 + }, + { + "epoch": 0.8670238675109596, + "grad_norm": 0.3070019483566284, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 890 + }, + { + "epoch": 0.8767657087189479, + "grad_norm": 0.29484760761260986, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 900 + }, + { + "epoch": 0.8865075499269361, + "grad_norm": 0.32373034954071045, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 910 + }, + { + "epoch": 0.8962493911349245, + "grad_norm": 0.3229396939277649, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 920 + }, + { + "epoch": 0.9059912323429128, + "grad_norm": 0.33151453733444214, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 930 + }, + { + "epoch": 0.9157330735509012, + "grad_norm": 0.32037460803985596, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 940 + }, + { + "epoch": 0.9254749147588894, + "grad_norm": 0.31283533573150635, + "learning_rate": 0.0002, + "loss": 1.822, + "step": 950 + }, + { + "epoch": 0.9352167559668777, + "grad_norm": 0.27984118461608887, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 960 + }, + { + "epoch": 0.9449585971748661, + "grad_norm": 0.316500186920166, + "learning_rate": 0.0002, + "loss": 1.7755, + "step": 970 + }, + { + "epoch": 0.9547004383828543, + "grad_norm": 0.33708682656288147, + "learning_rate": 0.0002, + "loss": 1.8032, + "step": 980 + }, + { + "epoch": 0.9644422795908427, + "grad_norm": 0.31026017665863037, + "learning_rate": 0.0002, + "loss": 1.8863, + "step": 990 + }, + { + "epoch": 0.974184120798831, + "grad_norm": 0.30874672532081604, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 1000 + }, + { + "epoch": 0.9839259620068193, + "grad_norm": 0.3257741630077362, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1010 + }, + { + "epoch": 0.9936678032148076, + "grad_norm": 0.2865653932094574, + "learning_rate": 0.0002, + "loss": 1.7936, + "step": 1020 + }, + { + "epoch": 0.9995129079396006, + "eval_loss": 1.8103164434432983, + "eval_runtime": 56.3917, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 1.135, + "step": 1026 + }, + { + "epoch": 1.003409644422796, + "grad_norm": 0.2860608398914337, + "learning_rate": 0.0002, + "loss": 1.7013, + "step": 1030 + }, + { + "epoch": 1.0131514856307842, + "grad_norm": 0.3156210780143738, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 1040 + }, + { + "epoch": 1.0228933268387725, + "grad_norm": 0.26126575469970703, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 1050 + }, + { + "epoch": 1.032635168046761, + "grad_norm": 0.3019633889198303, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 1060 + }, + { + "epoch": 1.042377009254749, + "grad_norm": 0.534140944480896, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1070 + }, + { + "epoch": 1.0521188504627375, + "grad_norm": 0.311872661113739, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 1080 + }, + { + "epoch": 1.0618606916707258, + "grad_norm": 0.3276001513004303, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 1090 + }, + { + "epoch": 1.071602532878714, + "grad_norm": 0.35227468609809875, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1100 + }, + { + "epoch": 1.0813443740867024, + "grad_norm": 0.3597564995288849, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 1110 + }, + { + "epoch": 1.0910862152946907, + "grad_norm": 0.3547225296497345, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 1120 + }, + { + "epoch": 1.100828056502679, + "grad_norm": 0.3399786353111267, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1130 + }, + { + "epoch": 1.1105698977106673, + "grad_norm": 0.3309086263179779, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 1140 + }, + { + "epoch": 1.1203117389186557, + "grad_norm": 0.39330706000328064, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1150 + }, + { + "epoch": 1.130053580126644, + "grad_norm": 0.3628021776676178, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1160 + }, + { + "epoch": 1.1397954213346322, + "grad_norm": 0.32995012402534485, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 1170 + }, + { + "epoch": 1.1495372625426206, + "grad_norm": 0.36292821168899536, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1180 + }, + { + "epoch": 1.159279103750609, + "grad_norm": 0.3470092713832855, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 1190 + }, + { + "epoch": 1.169020944958597, + "grad_norm": 0.3496156334877014, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1200 + }, + { + "epoch": 1.1787627861665855, + "grad_norm": 0.3442084789276123, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1210 + }, + { + "epoch": 1.1885046273745739, + "grad_norm": 0.34983909130096436, + "learning_rate": 0.0002, + "loss": 1.7763, + "step": 1220 + }, + { + "epoch": 1.198246468582562, + "grad_norm": 0.36505937576293945, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1230 + }, + { + "epoch": 1.2079883097905504, + "grad_norm": 0.31624770164489746, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1240 + }, + { + "epoch": 1.2177301509985388, + "grad_norm": 0.3528020679950714, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1250 + }, + { + "epoch": 1.2274719922065271, + "grad_norm": 0.29294025897979736, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1260 + }, + { + "epoch": 1.2372138334145153, + "grad_norm": 0.35048434138298035, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1270 + }, + { + "epoch": 1.2469556746225037, + "grad_norm": 0.35224461555480957, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1280 + }, + { + "epoch": 1.256697515830492, + "grad_norm": 0.4041554629802704, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1290 + }, + { + "epoch": 1.2664393570384802, + "grad_norm": 0.3447791039943695, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1300 + }, + { + "epoch": 1.2761811982464686, + "grad_norm": 0.3315333425998688, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1310 + }, + { + "epoch": 1.285923039454457, + "grad_norm": 0.3587741255760193, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 1320 + }, + { + "epoch": 1.2956648806624451, + "grad_norm": 0.3704394996166229, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1330 + }, + { + "epoch": 1.3054067218704335, + "grad_norm": 0.38131803274154663, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1340 + }, + { + "epoch": 1.3151485630784219, + "grad_norm": 0.36109617352485657, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 1350 + }, + { + "epoch": 1.32489040428641, + "grad_norm": 0.37283554673194885, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 1360 + }, + { + "epoch": 1.3346322454943984, + "grad_norm": 0.31808891892433167, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1370 + }, + { + "epoch": 1.3443740867023868, + "grad_norm": 0.3370385766029358, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1380 + }, + { + "epoch": 1.354115927910375, + "grad_norm": 0.3568558394908905, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1390 + }, + { + "epoch": 1.3638577691183633, + "grad_norm": 0.3537410497665405, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1400 + }, + { + "epoch": 1.3735996103263517, + "grad_norm": 0.3536544144153595, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 1410 + }, + { + "epoch": 1.38334145153434, + "grad_norm": 0.3772895038127899, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1420 + }, + { + "epoch": 1.3930832927423282, + "grad_norm": 0.38079720735549927, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1430 + }, + { + "epoch": 1.4028251339503166, + "grad_norm": 0.3811109662055969, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1440 + }, + { + "epoch": 1.412566975158305, + "grad_norm": 0.38586318492889404, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 1450 + }, + { + "epoch": 1.4223088163662934, + "grad_norm": 0.3405744135379791, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1460 + }, + { + "epoch": 1.4320506575742815, + "grad_norm": 0.39527642726898193, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1470 + }, + { + "epoch": 1.44179249878227, + "grad_norm": 0.4494728744029999, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 1480 + }, + { + "epoch": 1.4515343399902583, + "grad_norm": 0.34068453311920166, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1490 + }, + { + "epoch": 1.4612761811982464, + "grad_norm": 0.36169710755348206, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 1500 + }, + { + "epoch": 1.4710180224062348, + "grad_norm": 0.31519418954849243, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 1510 + }, + { + "epoch": 1.4807598636142232, + "grad_norm": 0.35117292404174805, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.4905017048222113, + "grad_norm": 0.40951141715049744, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1530 + }, + { + "epoch": 1.5002435460301997, + "grad_norm": 0.37542906403541565, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 1540 + }, + { + "epoch": 1.509985387238188, + "grad_norm": 0.35395753383636475, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1550 + }, + { + "epoch": 1.5197272284461762, + "grad_norm": 0.35497018694877625, + "learning_rate": 0.0002, + "loss": 1.6517, + "step": 1560 + }, + { + "epoch": 1.5294690696541646, + "grad_norm": 0.3693031072616577, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 1570 + }, + { + "epoch": 1.539210910862153, + "grad_norm": 0.34013301134109497, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1580 + }, + { + "epoch": 1.5489527520701412, + "grad_norm": 0.37312784790992737, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1590 + }, + { + "epoch": 1.5586945932781295, + "grad_norm": 0.357496440410614, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1600 + }, + { + "epoch": 1.568436434486118, + "grad_norm": 0.35192370414733887, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1610 + }, + { + "epoch": 1.578178275694106, + "grad_norm": 0.34144821763038635, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 1620 + }, + { + "epoch": 1.5879201169020944, + "grad_norm": 0.3320509195327759, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 1630 + }, + { + "epoch": 1.5976619581100828, + "grad_norm": 0.34178847074508667, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1640 + }, + { + "epoch": 1.607403799318071, + "grad_norm": 0.36567580699920654, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 1650 + }, + { + "epoch": 1.6171456405260596, + "grad_norm": 0.35599812865257263, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1660 + }, + { + "epoch": 1.6268874817340477, + "grad_norm": 0.33765384554862976, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1670 + }, + { + "epoch": 1.636629322942036, + "grad_norm": 0.33142679929733276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1680 + }, + { + "epoch": 1.6463711641500245, + "grad_norm": 0.6959079504013062, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1690 + }, + { + "epoch": 1.6561130053580126, + "grad_norm": 0.35073819756507874, + "learning_rate": 0.0002, + "loss": 1.7665, + "step": 1700 + }, + { + "epoch": 1.665854846566001, + "grad_norm": 0.3461478352546692, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1710 + }, + { + "epoch": 1.6755966877739894, + "grad_norm": 0.3697752058506012, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 1720 + }, + { + "epoch": 1.6853385289819776, + "grad_norm": 0.3755154609680176, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1730 + }, + { + "epoch": 1.695080370189966, + "grad_norm": 0.33977627754211426, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 1740 + }, + { + "epoch": 1.7048222113979543, + "grad_norm": 0.4001041650772095, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1750 + }, + { + "epoch": 1.7145640526059425, + "grad_norm": 0.36998286843299866, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 1760 + }, + { + "epoch": 1.7243058938139308, + "grad_norm": 0.39944565296173096, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1770 + }, + { + "epoch": 1.7340477350219192, + "grad_norm": 0.4002859890460968, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1780 + }, + { + "epoch": 1.7437895762299074, + "grad_norm": 0.33336859941482544, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 1790 + }, + { + "epoch": 1.7535314174378958, + "grad_norm": 0.35853952169418335, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 1800 + }, + { + "epoch": 1.7632732586458841, + "grad_norm": 0.35876700282096863, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1810 + }, + { + "epoch": 1.7730150998538723, + "grad_norm": 0.3497968912124634, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 1820 + }, + { + "epoch": 1.7827569410618607, + "grad_norm": 0.33182016015052795, + "learning_rate": 0.0002, + "loss": 1.7128, + "step": 1830 + }, + { + "epoch": 1.792498782269849, + "grad_norm": 0.33359771966934204, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1840 + }, + { + "epoch": 1.8022406234778372, + "grad_norm": 0.38070961833000183, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 1850 + }, + { + "epoch": 1.8119824646858256, + "grad_norm": 0.34111160039901733, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 1860 + }, + { + "epoch": 1.821724305893814, + "grad_norm": 0.4439302980899811, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1870 + }, + { + "epoch": 1.8314661471018021, + "grad_norm": 0.37065210938453674, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 1880 + }, + { + "epoch": 1.8412079883097907, + "grad_norm": 0.33630406856536865, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 1890 + }, + { + "epoch": 1.8509498295177789, + "grad_norm": 0.334553986787796, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 1900 + }, + { + "epoch": 1.860691670725767, + "grad_norm": 0.3603808879852295, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1910 + }, + { + "epoch": 1.8704335119337556, + "grad_norm": 0.4307343363761902, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1920 + }, + { + "epoch": 1.8801753531417438, + "grad_norm": 0.455602765083313, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1930 + }, + { + "epoch": 1.8899171943497322, + "grad_norm": 0.35242316126823425, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 1940 + }, + { + "epoch": 1.8996590355577205, + "grad_norm": 0.3589116632938385, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1950 + }, + { + "epoch": 1.9094008767657087, + "grad_norm": 0.3540741801261902, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1960 + }, + { + "epoch": 1.919142717973697, + "grad_norm": 0.3547612428665161, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 1970 + }, + { + "epoch": 1.9288845591816854, + "grad_norm": 0.3485773503780365, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1980 + }, + { + "epoch": 1.9386264003896736, + "grad_norm": 0.3560304641723633, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1990 + }, + { + "epoch": 1.948368241597662, + "grad_norm": 0.33299335837364197, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 2000 + }, + { + "epoch": 1.9581100828056504, + "grad_norm": 0.35622233152389526, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2010 + }, + { + "epoch": 1.9678519240136385, + "grad_norm": 0.3681301474571228, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 2020 + }, + { + "epoch": 1.9775937652216269, + "grad_norm": 0.36158084869384766, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2030 + }, + { + "epoch": 1.9873356064296153, + "grad_norm": 0.32560569047927856, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 2040 + }, + { + "epoch": 1.9970774476376034, + "grad_norm": 0.37404149770736694, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 2050 + }, + { + "epoch": 2.0, + "eval_loss": 1.8119343519210815, + "eval_runtime": 96.0045, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 0.667, + "step": 2053 + }, + { + "epoch": 2.006819288845592, + "grad_norm": 0.374188631772995, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 2060 + }, + { + "epoch": 2.01656113005358, + "grad_norm": 0.421764075756073, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2070 + }, + { + "epoch": 2.0263029712615683, + "grad_norm": 0.43841829895973206, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2080 + }, + { + "epoch": 2.036044812469557, + "grad_norm": 0.42298218607902527, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 2090 + }, + { + "epoch": 2.045786653677545, + "grad_norm": 0.43669602274894714, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 2100 + }, + { + "epoch": 2.0555284948855332, + "grad_norm": 0.4080469012260437, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 2110 + }, + { + "epoch": 2.065270336093522, + "grad_norm": 0.483192503452301, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2120 + }, + { + "epoch": 2.07501217730151, + "grad_norm": 0.44427400827407837, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 2130 + }, + { + "epoch": 2.084754018509498, + "grad_norm": 0.48835131525993347, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 2140 + }, + { + "epoch": 2.0944958597174868, + "grad_norm": 0.42733684182167053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2150 + }, + { + "epoch": 2.104237700925475, + "grad_norm": 0.4258694648742676, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 2160 + }, + { + "epoch": 2.113979542133463, + "grad_norm": 0.5164985656738281, + "learning_rate": 0.0002, + "loss": 1.517, + "step": 2170 + }, + { + "epoch": 2.1237213833414517, + "grad_norm": 0.4279228150844574, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 2180 + }, + { + "epoch": 2.13346322454944, + "grad_norm": 0.48209506273269653, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2190 + }, + { + "epoch": 2.143205065757428, + "grad_norm": 0.4071785509586334, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 2200 + }, + { + "epoch": 2.1529469069654166, + "grad_norm": 0.4629398584365845, + "learning_rate": 0.0002, + "loss": 1.5469, + "step": 2210 + }, + { + "epoch": 2.1626887481734047, + "grad_norm": 0.44390997290611267, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2220 + }, + { + "epoch": 2.172430589381393, + "grad_norm": 0.46886971592903137, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 2230 + }, + { + "epoch": 2.1821724305893815, + "grad_norm": 0.43745434284210205, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 2240 + }, + { + "epoch": 2.1919142717973696, + "grad_norm": 0.42737245559692383, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 2250 + }, + { + "epoch": 2.201656113005358, + "grad_norm": 0.5028428435325623, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 2260 + }, + { + "epoch": 2.2113979542133464, + "grad_norm": 0.48987212777137756, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 2270 + }, + { + "epoch": 2.2211397954213346, + "grad_norm": 0.48186370730400085, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 2280 + }, + { + "epoch": 2.2308816366293227, + "grad_norm": 0.4417429566383362, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 2290 + }, + { + "epoch": 2.2406234778373113, + "grad_norm": 0.4757710099220276, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2300 + }, + { + "epoch": 2.2503653190452995, + "grad_norm": 0.44449448585510254, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2310 + }, + { + "epoch": 2.260107160253288, + "grad_norm": 0.5070863962173462, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 2320 + }, + { + "epoch": 2.269849001461276, + "grad_norm": 0.4967133700847626, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2330 + }, + { + "epoch": 2.2795908426692644, + "grad_norm": 0.5110220909118652, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2340 + }, + { + "epoch": 2.289332683877253, + "grad_norm": 0.47984135150909424, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2350 + }, + { + "epoch": 2.299074525085241, + "grad_norm": 0.5005794763565063, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 2360 + }, + { + "epoch": 2.3088163662932293, + "grad_norm": 0.4991425573825836, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 2370 + }, + { + "epoch": 2.318558207501218, + "grad_norm": 0.4948616623878479, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2380 + }, + { + "epoch": 2.328300048709206, + "grad_norm": 0.4533160328865051, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2390 + }, + { + "epoch": 2.338041889917194, + "grad_norm": 0.5871071219444275, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 2400 + }, + { + "epoch": 2.347783731125183, + "grad_norm": 0.5048075914382935, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2410 + }, + { + "epoch": 2.357525572333171, + "grad_norm": 0.4973750412464142, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 2420 + }, + { + "epoch": 2.367267413541159, + "grad_norm": 0.48294538259506226, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 2430 + }, + { + "epoch": 2.3770092547491477, + "grad_norm": 0.7180454134941101, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 2440 + }, + { + "epoch": 2.386751095957136, + "grad_norm": 0.4627632796764374, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2450 + }, + { + "epoch": 2.396492937165124, + "grad_norm": 0.4834378957748413, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 2460 + }, + { + "epoch": 2.4062347783731126, + "grad_norm": 0.5173670649528503, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 2470 + }, + { + "epoch": 2.4159766195811008, + "grad_norm": 0.49652737379074097, + "learning_rate": 0.0002, + "loss": 1.5464, + "step": 2480 + }, + { + "epoch": 2.4257184607890894, + "grad_norm": 0.47052669525146484, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 2490 + }, + { + "epoch": 2.4354603019970775, + "grad_norm": 0.5188006162643433, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 2500 + }, + { + "epoch": 2.4452021432050657, + "grad_norm": 0.5010119676589966, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 2510 + }, + { + "epoch": 2.4549439844130543, + "grad_norm": 0.4765235483646393, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2520 + }, + { + "epoch": 2.4646858256210424, + "grad_norm": 0.5292699337005615, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2530 + }, + { + "epoch": 2.4744276668290306, + "grad_norm": 0.48555099964141846, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 2540 + }, + { + "epoch": 2.484169508037019, + "grad_norm": 0.4764043092727661, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 2550 + }, + { + "epoch": 2.4939113492450073, + "grad_norm": 0.47839659452438354, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 2560 + }, + { + "epoch": 2.5036531904529955, + "grad_norm": 0.4514436721801758, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 2570 + }, + { + "epoch": 2.513395031660984, + "grad_norm": 0.5681955218315125, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2580 + }, + { + "epoch": 2.5231368728689723, + "grad_norm": 0.49655985832214355, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 2590 + }, + { + "epoch": 2.5328787140769604, + "grad_norm": 0.5077657103538513, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2600 + }, + { + "epoch": 2.542620555284949, + "grad_norm": 0.5643279552459717, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 2610 + }, + { + "epoch": 2.552362396492937, + "grad_norm": 0.4715031087398529, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 2620 + }, + { + "epoch": 2.5621042377009253, + "grad_norm": 0.528400719165802, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2630 + }, + { + "epoch": 2.571846078908914, + "grad_norm": 0.49469611048698425, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2640 + }, + { + "epoch": 2.581587920116902, + "grad_norm": 0.4567806124687195, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 2650 + }, + { + "epoch": 2.5913297613248902, + "grad_norm": 0.5357107520103455, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2660 + }, + { + "epoch": 2.601071602532879, + "grad_norm": 0.46977677941322327, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 2670 + }, + { + "epoch": 2.610813443740867, + "grad_norm": 0.6626771092414856, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 2680 + }, + { + "epoch": 2.620555284948855, + "grad_norm": 0.4587472081184387, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2690 + }, + { + "epoch": 2.6302971261568437, + "grad_norm": 0.4816797077655792, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2700 + }, + { + "epoch": 2.640038967364832, + "grad_norm": 0.4856809675693512, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.64978080857282, + "grad_norm": 0.46010780334472656, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2720 + }, + { + "epoch": 2.6595226497808087, + "grad_norm": 0.4637954533100128, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 2730 + }, + { + "epoch": 2.669264490988797, + "grad_norm": 0.5954997539520264, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 2740 + }, + { + "epoch": 2.679006332196785, + "grad_norm": 0.5071861743927002, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2750 + }, + { + "epoch": 2.6887481734047736, + "grad_norm": 0.5415477156639099, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.6984900146127617, + "grad_norm": 0.5618549585342407, + "learning_rate": 0.0002, + "loss": 1.5476, + "step": 2770 + }, + { + "epoch": 2.70823185582075, + "grad_norm": 0.49338817596435547, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 2780 + }, + { + "epoch": 2.7179736970287385, + "grad_norm": 0.5149586796760559, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2790 + }, + { + "epoch": 2.7277155382367266, + "grad_norm": 0.6247242093086243, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2800 + }, + { + "epoch": 2.737457379444715, + "grad_norm": 0.4749542474746704, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2810 + }, + { + "epoch": 2.7471992206527034, + "grad_norm": 0.4979191720485687, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 2820 + }, + { + "epoch": 2.7569410618606915, + "grad_norm": 0.4885074198246002, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2830 + }, + { + "epoch": 2.76668290306868, + "grad_norm": 0.5047747492790222, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2840 + }, + { + "epoch": 2.7764247442766683, + "grad_norm": 0.5280140042304993, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2850 + }, + { + "epoch": 2.7861665854846565, + "grad_norm": 0.477668434381485, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 2860 + }, + { + "epoch": 2.795908426692645, + "grad_norm": 0.4816327393054962, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2870 + }, + { + "epoch": 2.805650267900633, + "grad_norm": 0.523259162902832, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2880 + }, + { + "epoch": 2.8153921091086214, + "grad_norm": 0.5045270919799805, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2890 + }, + { + "epoch": 2.82513395031661, + "grad_norm": 0.47986042499542236, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2900 + }, + { + "epoch": 2.834875791524598, + "grad_norm": 0.4858797490596771, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2910 + }, + { + "epoch": 2.8446176327325867, + "grad_norm": 0.5261512398719788, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 2920 + }, + { + "epoch": 2.854359473940575, + "grad_norm": 0.630550742149353, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2930 + }, + { + "epoch": 2.864101315148563, + "grad_norm": 0.49119752645492554, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2940 + }, + { + "epoch": 2.8738431563565516, + "grad_norm": 0.4779070317745209, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2950 + }, + { + "epoch": 2.88358499756454, + "grad_norm": 0.5059782266616821, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 2960 + }, + { + "epoch": 2.893326838772528, + "grad_norm": 0.5466655492782593, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2970 + }, + { + "epoch": 2.9030686799805165, + "grad_norm": 0.4865640103816986, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 2980 + }, + { + "epoch": 2.9128105211885047, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 2990 + }, + { + "epoch": 2.922552362396493, + "grad_norm": 0.9112305641174316, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 3000 + }, + { + "epoch": 2.9322942036044815, + "grad_norm": 0.3938814103603363, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 3010 + }, + { + "epoch": 2.9420360448124696, + "grad_norm": 0.5500800609588623, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 3020 + }, + { + "epoch": 2.9517778860204578, + "grad_norm": 0.5346390604972839, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3030 + }, + { + "epoch": 2.9615197272284464, + "grad_norm": 0.5245014429092407, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3040 + }, + { + "epoch": 2.9712615684364345, + "grad_norm": 0.4906884431838989, + "learning_rate": 0.0002, + "loss": 1.5549, + "step": 3050 + }, + { + "epoch": 2.9810034096444227, + "grad_norm": 0.47086769342422485, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3060 + }, + { + "epoch": 2.9907452508524113, + "grad_norm": 0.5290229320526123, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3070 + }, + { + "epoch": 2.9995129079396006, + "eval_loss": 1.8463934659957886, + "eval_runtime": 56.2401, + "eval_samples_per_second": 9.015, + "eval_steps_per_second": 1.138, + "step": 3079 + }, + { + "epoch": 3.0004870920603994, + "grad_norm": 0.49992576241493225, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 3080 + }, + { + "epoch": 3.0102289332683876, + "grad_norm": 0.8242783546447754, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3090 + }, + { + "epoch": 3.019970774476376, + "grad_norm": 0.6330569386482239, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3100 + }, + { + "epoch": 3.0297126156843643, + "grad_norm": 0.566097617149353, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 3110 + }, + { + "epoch": 3.0394544568923525, + "grad_norm": 0.6337586045265198, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 3120 + }, + { + "epoch": 3.049196298100341, + "grad_norm": 0.7339403033256531, + "learning_rate": 0.0002, + "loss": 1.3916, + "step": 3130 + }, + { + "epoch": 3.0589381393083293, + "grad_norm": 0.7187346816062927, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 3140 + }, + { + "epoch": 3.0686799805163174, + "grad_norm": 0.7116255760192871, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3150 + }, + { + "epoch": 3.078421821724306, + "grad_norm": 0.6493807435035706, + "learning_rate": 0.0002, + "loss": 1.4452, + "step": 3160 + }, + { + "epoch": 3.088163662932294, + "grad_norm": 0.6777266263961792, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 3170 + }, + { + "epoch": 3.0979055041402823, + "grad_norm": 0.6342006325721741, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 3180 + }, + { + "epoch": 3.107647345348271, + "grad_norm": 0.6608964204788208, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 3190 + }, + { + "epoch": 3.117389186556259, + "grad_norm": 0.7230247259140015, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 3200 + }, + { + "epoch": 3.1271310277642472, + "grad_norm": 0.650368332862854, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 3210 + }, + { + "epoch": 3.136872868972236, + "grad_norm": 0.7319342494010925, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 3220 + }, + { + "epoch": 3.146614710180224, + "grad_norm": 0.7159963846206665, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 3230 + }, + { + "epoch": 3.156356551388212, + "grad_norm": 0.8905230164527893, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 3240 + }, + { + "epoch": 3.1660983925962007, + "grad_norm": 0.6920804381370544, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 3250 + }, + { + "epoch": 3.175840233804189, + "grad_norm": 0.6782063841819763, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 3260 + }, + { + "epoch": 3.1855820750121775, + "grad_norm": 0.735325276851654, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3270 + }, + { + "epoch": 3.1953239162201656, + "grad_norm": 0.6657978296279907, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 3280 + }, + { + "epoch": 3.205065757428154, + "grad_norm": 0.771315336227417, + "learning_rate": 0.0002, + "loss": 1.3456, + "step": 3290 + }, + { + "epoch": 3.2148075986361424, + "grad_norm": 0.6492983102798462, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 3300 + }, + { + "epoch": 3.2245494398441306, + "grad_norm": 0.7513770461082458, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3310 + }, + { + "epoch": 3.2342912810521187, + "grad_norm": 0.7091423869132996, + "learning_rate": 0.0002, + "loss": 1.4032, + "step": 3320 + }, + { + "epoch": 3.2440331222601073, + "grad_norm": 0.6663975119590759, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 3330 + }, + { + "epoch": 3.2537749634680955, + "grad_norm": 0.6813122034072876, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 3340 + }, + { + "epoch": 3.2635168046760836, + "grad_norm": 0.6602569818496704, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 3350 + }, + { + "epoch": 3.2732586458840722, + "grad_norm": 0.718270480632782, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 3360 + }, + { + "epoch": 3.2830004870920604, + "grad_norm": 0.6884173154830933, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 3370 + }, + { + "epoch": 3.2927423283000485, + "grad_norm": 0.7039775848388672, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3380 + }, + { + "epoch": 3.302484169508037, + "grad_norm": 0.7444299459457397, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3390 + }, + { + "epoch": 3.3122260107160253, + "grad_norm": 0.7187064290046692, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 3400 + }, + { + "epoch": 3.3219678519240134, + "grad_norm": 0.599396288394928, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3410 + }, + { + "epoch": 3.331709693132002, + "grad_norm": 0.7670390009880066, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3420 + }, + { + "epoch": 3.34145153433999, + "grad_norm": 0.6654478311538696, + "learning_rate": 0.0002, + "loss": 1.4411, + "step": 3430 + }, + { + "epoch": 3.351193375547979, + "grad_norm": 0.6644385457038879, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 3440 + }, + { + "epoch": 3.360935216755967, + "grad_norm": 0.6974098086357117, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3450 + }, + { + "epoch": 3.370677057963955, + "grad_norm": 0.7350399494171143, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 3460 + }, + { + "epoch": 3.3804188991719437, + "grad_norm": 0.714721143245697, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 3470 + }, + { + "epoch": 3.390160740379932, + "grad_norm": 0.7006027698516846, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 3480 + }, + { + "epoch": 3.39990258158792, + "grad_norm": 0.6767925024032593, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3490 + }, + { + "epoch": 3.4096444227959086, + "grad_norm": 0.6721355319023132, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 3500 + }, + { + "epoch": 3.419386264003897, + "grad_norm": 0.6845725178718567, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3510 + }, + { + "epoch": 3.429128105211885, + "grad_norm": 0.6882196664810181, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 3520 + }, + { + "epoch": 3.4388699464198735, + "grad_norm": 0.7663240432739258, + "learning_rate": 0.0002, + "loss": 1.4962, + "step": 3530 + }, + { + "epoch": 3.4486117876278617, + "grad_norm": 0.6304219365119934, + "learning_rate": 0.0002, + "loss": 1.4644, + "step": 3540 + }, + { + "epoch": 3.45835362883585, + "grad_norm": 0.668678879737854, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3550 + }, + { + "epoch": 3.4680954700438384, + "grad_norm": 0.7526912093162537, + "learning_rate": 0.0002, + "loss": 1.4874, + "step": 3560 + }, + { + "epoch": 3.4778373112518266, + "grad_norm": 1.089495301246643, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 3570 + }, + { + "epoch": 3.4875791524598148, + "grad_norm": 0.7282902002334595, + "learning_rate": 0.0002, + "loss": 1.3871, + "step": 3580 + }, + { + "epoch": 3.4973209936678034, + "grad_norm": 0.6540156602859497, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3590 + }, + { + "epoch": 3.5070628348757915, + "grad_norm": 0.6449568867683411, + "learning_rate": 0.0002, + "loss": 1.4367, + "step": 3600 + }, + { + "epoch": 3.5168046760837797, + "grad_norm": 0.7262216210365295, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 3610 + }, + { + "epoch": 3.5265465172917683, + "grad_norm": 0.6048615574836731, + "learning_rate": 0.0002, + "loss": 1.4374, + "step": 3620 + }, + { + "epoch": 3.5362883584997564, + "grad_norm": 0.6780537366867065, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 3630 + }, + { + "epoch": 3.5460301997077446, + "grad_norm": 0.6851925253868103, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 3640 + }, + { + "epoch": 3.555772040915733, + "grad_norm": 0.6530634164810181, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3650 + }, + { + "epoch": 3.5655138821237213, + "grad_norm": 0.7193992733955383, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3660 + }, + { + "epoch": 3.5752557233317095, + "grad_norm": 0.767496645450592, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 3670 + }, + { + "epoch": 3.584997564539698, + "grad_norm": 0.6912919282913208, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 3680 + }, + { + "epoch": 3.5947394057476862, + "grad_norm": 0.7383436560630798, + "learning_rate": 0.0002, + "loss": 1.4497, + "step": 3690 + }, + { + "epoch": 3.6044812469556744, + "grad_norm": 0.6746662855148315, + "learning_rate": 0.0002, + "loss": 1.4822, + "step": 3700 + }, + { + "epoch": 3.614223088163663, + "grad_norm": 0.6885138750076294, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3710 + }, + { + "epoch": 3.623964929371651, + "grad_norm": 0.6694392561912537, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3720 + }, + { + "epoch": 3.6337067705796393, + "grad_norm": 0.812358021736145, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 3730 + }, + { + "epoch": 3.643448611787628, + "grad_norm": 0.7267130017280579, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 3740 + }, + { + "epoch": 3.653190452995616, + "grad_norm": 0.6958749294281006, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3750 + }, + { + "epoch": 3.6629322942036042, + "grad_norm": 0.6805673241615295, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3760 + }, + { + "epoch": 3.672674135411593, + "grad_norm": 0.7184410095214844, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3770 + }, + { + "epoch": 3.682415976619581, + "grad_norm": 0.7716330289840698, + "learning_rate": 0.0002, + "loss": 1.3935, + "step": 3780 + }, + { + "epoch": 3.6921578178275696, + "grad_norm": 0.6675831079483032, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3790 + }, + { + "epoch": 3.7018996590355577, + "grad_norm": 0.6480095386505127, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 3800 + }, + { + "epoch": 3.711641500243546, + "grad_norm": 0.6559418439865112, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 3810 + }, + { + "epoch": 3.7213833414515345, + "grad_norm": 0.6596545577049255, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 3820 + }, + { + "epoch": 3.7311251826595226, + "grad_norm": 0.7172950506210327, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3830 + }, + { + "epoch": 3.740867023867511, + "grad_norm": 0.796148419380188, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 3840 + }, + { + "epoch": 3.7506088650754994, + "grad_norm": 0.6600322723388672, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 3850 + }, + { + "epoch": 3.7603507062834876, + "grad_norm": 0.6776387691497803, + "learning_rate": 0.0002, + "loss": 1.4201, + "step": 3860 + }, + { + "epoch": 3.770092547491476, + "grad_norm": 0.7768304347991943, + "learning_rate": 0.0002, + "loss": 1.3893, + "step": 3870 + }, + { + "epoch": 3.7798343886994643, + "grad_norm": 1.0579794645309448, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 3880 + }, + { + "epoch": 3.7895762299074525, + "grad_norm": 0.6757252812385559, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 3890 + }, + { + "epoch": 3.799318071115441, + "grad_norm": 0.6706996560096741, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3900 + }, + { + "epoch": 3.809059912323429, + "grad_norm": 0.7026948928833008, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 3910 + }, + { + "epoch": 3.8188017535314174, + "grad_norm": 0.6437768340110779, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 3920 + }, + { + "epoch": 3.828543594739406, + "grad_norm": 0.7015706300735474, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 3930 + }, + { + "epoch": 3.838285435947394, + "grad_norm": 0.7049482464790344, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 3940 + }, + { + "epoch": 3.8480272771553823, + "grad_norm": 0.6533724665641785, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3950 + }, + { + "epoch": 3.857769118363371, + "grad_norm": 0.7312499284744263, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 3960 + }, + { + "epoch": 3.867510959571359, + "grad_norm": 0.6858801245689392, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3970 + }, + { + "epoch": 3.877252800779347, + "grad_norm": 0.770423173904419, + "learning_rate": 0.0002, + "loss": 1.4423, + "step": 3980 + }, + { + "epoch": 3.886994641987336, + "grad_norm": 0.6987539529800415, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 3990 + }, + { + "epoch": 3.896736483195324, + "grad_norm": 0.7072722315788269, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 4000 + }, + { + "epoch": 3.906478324403312, + "grad_norm": 0.6492931842803955, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4010 + }, + { + "epoch": 3.9162201656113007, + "grad_norm": 0.7716232538223267, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 4020 + }, + { + "epoch": 3.925962006819289, + "grad_norm": 0.722949743270874, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 4030 + }, + { + "epoch": 3.935703848027277, + "grad_norm": 0.7434365749359131, + "learning_rate": 0.0002, + "loss": 1.3914, + "step": 4040 + }, + { + "epoch": 3.9454456892352656, + "grad_norm": 0.6691509485244751, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 4050 + }, + { + "epoch": 3.9551875304432538, + "grad_norm": 0.6850284337997437, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4060 + }, + { + "epoch": 3.964929371651242, + "grad_norm": 0.6954452991485596, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4070 + }, + { + "epoch": 3.9746712128592305, + "grad_norm": 0.9316364526748657, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 4080 + }, + { + "epoch": 3.9844130540672187, + "grad_norm": 0.6908289194107056, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 4090 + }, + { + "epoch": 3.994154895275207, + "grad_norm": 0.666782021522522, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 4100 + }, + { + "epoch": 4.0, + "eval_loss": 1.9233275651931763, + "eval_runtime": 55.9536, + "eval_samples_per_second": 9.061, + "eval_steps_per_second": 1.144, + "step": 4106 + }, + { + "epoch": 4.003896736483195, + "grad_norm": 0.7726166248321533, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 4110 + }, + { + "epoch": 4.013638577691184, + "grad_norm": 1.1338967084884644, + "learning_rate": 0.0002, + "loss": 1.1415, + "step": 4120 + }, + { + "epoch": 4.023380418899172, + "grad_norm": 0.9530029296875, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 4130 + }, + { + "epoch": 4.03312226010716, + "grad_norm": 1.1058554649353027, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 4140 + }, + { + "epoch": 4.042864101315149, + "grad_norm": 0.8765049576759338, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 4150 + }, + { + "epoch": 4.052605942523137, + "grad_norm": 1.1774667501449585, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4160 + }, + { + "epoch": 4.062347783731125, + "grad_norm": 0.9301433563232422, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 4170 + }, + { + "epoch": 4.072089624939114, + "grad_norm": 1.0196778774261475, + "learning_rate": 0.0002, + "loss": 1.1807, + "step": 4180 + }, + { + "epoch": 4.081831466147102, + "grad_norm": 1.1380577087402344, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 4190 + }, + { + "epoch": 4.09157330735509, + "grad_norm": 0.9121319651603699, + "learning_rate": 0.0002, + "loss": 1.2521, + "step": 4200 + }, + { + "epoch": 4.101315148563079, + "grad_norm": 0.9495378732681274, + "learning_rate": 0.0002, + "loss": 1.1747, + "step": 4210 + }, + { + "epoch": 4.1110569897710665, + "grad_norm": 0.8058680295944214, + "learning_rate": 0.0002, + "loss": 1.1829, + "step": 4220 + }, + { + "epoch": 4.120798830979055, + "grad_norm": 1.000887393951416, + "learning_rate": 0.0002, + "loss": 1.1732, + "step": 4230 + }, + { + "epoch": 4.130540672187044, + "grad_norm": 0.9529102444648743, + "learning_rate": 0.0002, + "loss": 1.1947, + "step": 4240 + }, + { + "epoch": 4.140282513395031, + "grad_norm": 1.0257115364074707, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 4250 + }, + { + "epoch": 4.15002435460302, + "grad_norm": 0.9590303897857666, + "learning_rate": 0.0002, + "loss": 1.2293, + "step": 4260 + }, + { + "epoch": 4.159766195811009, + "grad_norm": 1.065291166305542, + "learning_rate": 0.0002, + "loss": 1.1918, + "step": 4270 + }, + { + "epoch": 4.169508037018996, + "grad_norm": 0.8819697499275208, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 4280 + }, + { + "epoch": 4.179249878226985, + "grad_norm": 1.0335261821746826, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 4290 + }, + { + "epoch": 4.1889917194349735, + "grad_norm": 0.8872809410095215, + "learning_rate": 0.0002, + "loss": 1.2131, + "step": 4300 + }, + { + "epoch": 4.198733560642961, + "grad_norm": 0.9883159399032593, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 4310 + }, + { + "epoch": 4.20847540185095, + "grad_norm": 1.0254192352294922, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 4320 + }, + { + "epoch": 4.218217243058938, + "grad_norm": 0.9432600736618042, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 4330 + }, + { + "epoch": 4.227959084266926, + "grad_norm": 1.1008676290512085, + "learning_rate": 0.0002, + "loss": 1.2684, + "step": 4340 + }, + { + "epoch": 4.237700925474915, + "grad_norm": 1.0829699039459229, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 4350 + }, + { + "epoch": 4.247442766682903, + "grad_norm": 1.016847848892212, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 4360 + }, + { + "epoch": 4.257184607890891, + "grad_norm": 0.8924864530563354, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 4370 + }, + { + "epoch": 4.26692644909888, + "grad_norm": 0.9300530552864075, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 4380 + }, + { + "epoch": 4.276668290306868, + "grad_norm": 0.9684814810752869, + "learning_rate": 0.0002, + "loss": 1.1696, + "step": 4390 + }, + { + "epoch": 4.286410131514856, + "grad_norm": 0.9916250705718994, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 4400 + }, + { + "epoch": 4.2961519727228445, + "grad_norm": 0.903680145740509, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 4410 + }, + { + "epoch": 4.305893813930833, + "grad_norm": 0.8713505268096924, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 4420 + }, + { + "epoch": 4.315635655138821, + "grad_norm": 0.9983905553817749, + "learning_rate": 0.0002, + "loss": 1.1957, + "step": 4430 + }, + { + "epoch": 4.3253774963468095, + "grad_norm": 1.1689040660858154, + "learning_rate": 0.0002, + "loss": 1.2676, + "step": 4440 + }, + { + "epoch": 4.335119337554798, + "grad_norm": 0.9316853880882263, + "learning_rate": 0.0002, + "loss": 1.2166, + "step": 4450 + }, + { + "epoch": 4.344861178762786, + "grad_norm": 0.9175887107849121, + "learning_rate": 0.0002, + "loss": 1.222, + "step": 4460 + }, + { + "epoch": 4.354603019970774, + "grad_norm": 0.9348906874656677, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4470 + }, + { + "epoch": 4.364344861178763, + "grad_norm": 0.9727016687393188, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 4480 + }, + { + "epoch": 4.374086702386751, + "grad_norm": 0.9843429923057556, + "learning_rate": 0.0002, + "loss": 1.2616, + "step": 4490 + }, + { + "epoch": 4.383828543594739, + "grad_norm": 0.9615852236747742, + "learning_rate": 0.0002, + "loss": 1.2488, + "step": 4500 + }, + { + "epoch": 4.393570384802728, + "grad_norm": 0.9688583612442017, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 4510 + }, + { + "epoch": 4.403312226010716, + "grad_norm": 0.9933668375015259, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 4520 + }, + { + "epoch": 4.413054067218704, + "grad_norm": 1.0626686811447144, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 4530 + }, + { + "epoch": 4.422795908426693, + "grad_norm": 0.9536267518997192, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 4540 + }, + { + "epoch": 4.432537749634681, + "grad_norm": 0.9777140021324158, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4550 + }, + { + "epoch": 4.442279590842669, + "grad_norm": 0.980780839920044, + "learning_rate": 0.0002, + "loss": 1.2878, + "step": 4560 + }, + { + "epoch": 4.452021432050658, + "grad_norm": 1.0147196054458618, + "learning_rate": 0.0002, + "loss": 1.2597, + "step": 4570 + }, + { + "epoch": 4.461763273258645, + "grad_norm": 0.9763361811637878, + "learning_rate": 0.0002, + "loss": 1.2148, + "step": 4580 + }, + { + "epoch": 4.471505114466634, + "grad_norm": 1.0300798416137695, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 4590 + }, + { + "epoch": 4.481246955674623, + "grad_norm": 0.8833121657371521, + "learning_rate": 0.0002, + "loss": 1.2665, + "step": 4600 + }, + { + "epoch": 4.490988796882611, + "grad_norm": 1.1214020252227783, + "learning_rate": 0.0002, + "loss": 1.1899, + "step": 4610 + }, + { + "epoch": 4.500730638090599, + "grad_norm": 0.8843787908554077, + "learning_rate": 0.0002, + "loss": 1.2579, + "step": 4620 + }, + { + "epoch": 4.5104724792985875, + "grad_norm": 0.9942020773887634, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 4630 + }, + { + "epoch": 4.520214320506576, + "grad_norm": 1.0033202171325684, + "learning_rate": 0.0002, + "loss": 1.3172, + "step": 4640 + }, + { + "epoch": 4.529956161714564, + "grad_norm": 0.8767235279083252, + "learning_rate": 0.0002, + "loss": 1.2024, + "step": 4650 + }, + { + "epoch": 4.539698002922552, + "grad_norm": 1.0117276906967163, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 4660 + }, + { + "epoch": 4.549439844130541, + "grad_norm": 1.2787362337112427, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 4670 + }, + { + "epoch": 4.559181685338529, + "grad_norm": 0.8824878931045532, + "learning_rate": 0.0002, + "loss": 1.2603, + "step": 4680 + }, + { + "epoch": 4.568923526546517, + "grad_norm": 0.9209560751914978, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4690 + }, + { + "epoch": 4.578665367754506, + "grad_norm": 1.1064010858535767, + "learning_rate": 0.0002, + "loss": 1.1916, + "step": 4700 + }, + { + "epoch": 4.588407208962494, + "grad_norm": 0.8914572596549988, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 4710 + }, + { + "epoch": 4.598149050170482, + "grad_norm": 1.0412265062332153, + "learning_rate": 0.0002, + "loss": 1.2861, + "step": 4720 + }, + { + "epoch": 4.607890891378471, + "grad_norm": 1.1950221061706543, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4730 + }, + { + "epoch": 4.617632732586459, + "grad_norm": 0.8938062787055969, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 4740 + }, + { + "epoch": 4.627374573794447, + "grad_norm": 0.9849569201469421, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 4750 + }, + { + "epoch": 4.637116415002436, + "grad_norm": 1.0081515312194824, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4760 + }, + { + "epoch": 4.6468582562104235, + "grad_norm": 0.8566309213638306, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 4770 + }, + { + "epoch": 4.656600097418412, + "grad_norm": 1.1750118732452393, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 4780 + }, + { + "epoch": 4.666341938626401, + "grad_norm": 0.925502598285675, + "learning_rate": 0.0002, + "loss": 1.2537, + "step": 4790 + }, + { + "epoch": 4.676083779834388, + "grad_norm": 1.0402472019195557, + "learning_rate": 0.0002, + "loss": 1.2146, + "step": 4800 + }, + { + "epoch": 4.685825621042377, + "grad_norm": 0.9772472977638245, + "learning_rate": 0.0002, + "loss": 1.2555, + "step": 4810 + }, + { + "epoch": 4.695567462250366, + "grad_norm": 0.9082779288291931, + "learning_rate": 0.0002, + "loss": 1.2667, + "step": 4820 + }, + { + "epoch": 4.705309303458353, + "grad_norm": 0.8026862740516663, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 4830 + }, + { + "epoch": 4.715051144666342, + "grad_norm": 1.1631089448928833, + "learning_rate": 0.0002, + "loss": 1.3369, + "step": 4840 + }, + { + "epoch": 4.7247929858743305, + "grad_norm": 0.9384787678718567, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 4850 + }, + { + "epoch": 4.734534827082318, + "grad_norm": 1.2151581048965454, + "learning_rate": 0.0002, + "loss": 1.2588, + "step": 4860 + }, + { + "epoch": 4.744276668290307, + "grad_norm": 0.9679436087608337, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 4870 + }, + { + "epoch": 4.754018509498295, + "grad_norm": 0.8352158069610596, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 4880 + }, + { + "epoch": 4.763760350706283, + "grad_norm": 1.0205804109573364, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 4890 + }, + { + "epoch": 4.773502191914272, + "grad_norm": 0.9814772605895996, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 4900 + }, + { + "epoch": 4.78324403312226, + "grad_norm": 1.002854347229004, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 4910 + }, + { + "epoch": 4.792985874330248, + "grad_norm": 1.1609505414962769, + "learning_rate": 0.0002, + "loss": 1.3143, + "step": 4920 + }, + { + "epoch": 4.802727715538237, + "grad_norm": 0.9354982376098633, + "learning_rate": 0.0002, + "loss": 1.3166, + "step": 4930 + }, + { + "epoch": 4.812469556746225, + "grad_norm": 0.9761685729026794, + "learning_rate": 0.0002, + "loss": 1.2978, + "step": 4940 + }, + { + "epoch": 4.822211397954213, + "grad_norm": 1.0604596138000488, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 4950 + }, + { + "epoch": 4.8319532391622015, + "grad_norm": 1.0902808904647827, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 4960 + }, + { + "epoch": 4.84169508037019, + "grad_norm": 1.0174955129623413, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 4970 + }, + { + "epoch": 4.851436921578179, + "grad_norm": 1.0995253324508667, + "learning_rate": 0.0002, + "loss": 1.3141, + "step": 4980 + }, + { + "epoch": 4.8611787627861665, + "grad_norm": 0.880993127822876, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4990 + }, + { + "epoch": 4.870920603994155, + "grad_norm": 0.9472237825393677, + "learning_rate": 0.0002, + "loss": 1.2547, + "step": 5000 + }, + { + "epoch": 4.880662445202143, + "grad_norm": 0.9504236578941345, + "learning_rate": 0.0002, + "loss": 1.4078, + "step": 5010 + }, + { + "epoch": 4.890404286410131, + "grad_norm": 1.1261742115020752, + "learning_rate": 0.0002, + "loss": 1.2791, + "step": 5020 + }, + { + "epoch": 4.90014612761812, + "grad_norm": 0.904674768447876, + "learning_rate": 0.0002, + "loss": 1.3707, + "step": 5030 + }, + { + "epoch": 4.909887968826109, + "grad_norm": 0.8828991055488586, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 5040 + }, + { + "epoch": 4.919629810034096, + "grad_norm": 1.0156532526016235, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 5050 + }, + { + "epoch": 4.929371651242085, + "grad_norm": 0.8975168466567993, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 5060 + }, + { + "epoch": 4.939113492450073, + "grad_norm": 0.9787213802337646, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 5070 + }, + { + "epoch": 4.948855333658061, + "grad_norm": 1.0801568031311035, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 5080 + }, + { + "epoch": 4.95859717486605, + "grad_norm": 1.0655089616775513, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 5090 + }, + { + "epoch": 4.968339016074038, + "grad_norm": 0.8941320180892944, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 5100 + }, + { + "epoch": 4.978080857282026, + "grad_norm": 1.050621747970581, + "learning_rate": 0.0002, + "loss": 1.2846, + "step": 5110 + }, + { + "epoch": 4.987822698490015, + "grad_norm": 0.9724781513214111, + "learning_rate": 0.0002, + "loss": 1.3791, + "step": 5120 + }, + { + "epoch": 4.997564539698003, + "grad_norm": 0.9850538969039917, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 5130 + }, + { + "epoch": 4.9995129079396, + "eval_loss": 2.0824170112609863, + "eval_runtime": 55.592, + "eval_samples_per_second": 9.12, + "eval_steps_per_second": 1.151, + "step": 5132 + } + ], + "logging_steps": 10, + "max_steps": 8208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.636244244365312e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-5132/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f6017ad2003beb2b8b1e6489151fda4210695ba6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05a8db3be673b644ec72212b7426226dfc6a4143b9db1d52d06b769defd957e +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..aeb23b37299e91905445e7a634bc47b74af209eb --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc1bf22316ad6f9c423bce00fc88b52faa240878bb818509a7d6bd35e32a04af +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a26dad7804e1958688284db43885b5075b1f773b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3bc5a941d2094fad4f0f385bae3ab22f53e7e0fc54bf4e91b5db83b189760ee +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f64ade66f62200ea3b67d81f19c9b595c8d0b98c --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e10b88e0762b2dc9ec6a69bbd0025951b3a06dcda517a323ae83fcf1ba4038e +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dc395a0a8299d1a55b5b2397714b16d610cd9cc9 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/trainer_state.json @@ -0,0 +1,4386 @@ +{ + "best_metric": 1.8103164434432983, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 6159, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00974184120798831, + "grad_norm": 0.6537588834762573, + "learning_rate": 0.0002, + "loss": 3.0782, + "step": 10 + }, + { + "epoch": 0.01948368241597662, + "grad_norm": 0.5270306468009949, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 20 + }, + { + "epoch": 0.029225523623964928, + "grad_norm": 0.6826501488685608, + "learning_rate": 0.0002, + "loss": 2.2589, + "step": 30 + }, + { + "epoch": 0.03896736483195324, + "grad_norm": 0.5061377286911011, + "learning_rate": 0.0002, + "loss": 2.0141, + "step": 40 + }, + { + "epoch": 0.04870920603994155, + "grad_norm": 0.4300410747528076, + "learning_rate": 0.0002, + "loss": 1.9458, + "step": 50 + }, + { + "epoch": 0.058451047247929856, + "grad_norm": 0.5063319802284241, + "learning_rate": 0.0002, + "loss": 1.983, + "step": 60 + }, + { + "epoch": 0.06819288845591817, + "grad_norm": 0.49310117959976196, + "learning_rate": 0.0002, + "loss": 1.9799, + "step": 70 + }, + { + "epoch": 0.07793472966390648, + "grad_norm": 0.4676004648208618, + "learning_rate": 0.0002, + "loss": 1.9277, + "step": 80 + }, + { + "epoch": 0.08767657087189479, + "grad_norm": 0.41647228598594666, + "learning_rate": 0.0002, + "loss": 1.9147, + "step": 90 + }, + { + "epoch": 0.0974184120798831, + "grad_norm": 0.40217313170433044, + "learning_rate": 0.0002, + "loss": 1.8894, + "step": 100 + }, + { + "epoch": 0.1071602532878714, + "grad_norm": 0.4123637080192566, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 110 + }, + { + "epoch": 0.11690209449585971, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 120 + }, + { + "epoch": 0.12664393570384802, + "grad_norm": 0.3179326355457306, + "learning_rate": 0.0002, + "loss": 1.894, + "step": 130 + }, + { + "epoch": 0.13638577691183634, + "grad_norm": 0.3548192083835602, + "learning_rate": 0.0002, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.14612761811982464, + "grad_norm": 0.3273540139198303, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 150 + }, + { + "epoch": 0.15586945932781296, + "grad_norm": 0.36500975489616394, + "learning_rate": 0.0002, + "loss": 1.8496, + "step": 160 + }, + { + "epoch": 0.16561130053580125, + "grad_norm": 0.4106619656085968, + "learning_rate": 0.0002, + "loss": 1.8473, + "step": 170 + }, + { + "epoch": 0.17535314174378958, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0002, + "loss": 1.841, + "step": 180 + }, + { + "epoch": 0.1850949829517779, + "grad_norm": 0.3608580231666565, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 190 + }, + { + "epoch": 0.1948368241597662, + "grad_norm": 0.4291760325431824, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 200 + }, + { + "epoch": 0.20457866536775451, + "grad_norm": 0.344184011220932, + "learning_rate": 0.0002, + "loss": 1.8437, + "step": 210 + }, + { + "epoch": 0.2143205065757428, + "grad_norm": 0.3834705650806427, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 220 + }, + { + "epoch": 0.22406234778373113, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 230 + }, + { + "epoch": 0.23380418899171942, + "grad_norm": 0.4306780695915222, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 240 + }, + { + "epoch": 0.24354603019970775, + "grad_norm": 0.5066465139389038, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 250 + }, + { + "epoch": 0.25328787140769604, + "grad_norm": 0.34227681159973145, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 260 + }, + { + "epoch": 0.26302971261568436, + "grad_norm": 0.3346865475177765, + "learning_rate": 0.0002, + "loss": 1.8614, + "step": 270 + }, + { + "epoch": 0.2727715538236727, + "grad_norm": 0.3639362156391144, + "learning_rate": 0.0002, + "loss": 1.8502, + "step": 280 + }, + { + "epoch": 0.282513395031661, + "grad_norm": 0.33223700523376465, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 290 + }, + { + "epoch": 0.2922552362396493, + "grad_norm": 0.35176315903663635, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 300 + }, + { + "epoch": 0.3019970774476376, + "grad_norm": 0.3581472635269165, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 310 + }, + { + "epoch": 0.3117389186556259, + "grad_norm": 0.35943421721458435, + "learning_rate": 0.0002, + "loss": 1.8262, + "step": 320 + }, + { + "epoch": 0.32148075986361424, + "grad_norm": 0.322051078081131, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 330 + }, + { + "epoch": 0.3312226010716025, + "grad_norm": 0.33904823660850525, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 340 + }, + { + "epoch": 0.34096444227959083, + "grad_norm": 0.39162731170654297, + "learning_rate": 0.0002, + "loss": 1.9159, + "step": 350 + }, + { + "epoch": 0.35070628348757915, + "grad_norm": 0.330624520778656, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 360 + }, + { + "epoch": 0.3604481246955675, + "grad_norm": 0.3793248236179352, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 370 + }, + { + "epoch": 0.3701899659035558, + "grad_norm": 0.3347395658493042, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 380 + }, + { + "epoch": 0.37993180711154406, + "grad_norm": 0.30527254939079285, + "learning_rate": 0.0002, + "loss": 1.9244, + "step": 390 + }, + { + "epoch": 0.3896736483195324, + "grad_norm": 0.3081390857696533, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 400 + }, + { + "epoch": 0.3994154895275207, + "grad_norm": 0.3742620050907135, + "learning_rate": 0.0002, + "loss": 1.8968, + "step": 410 + }, + { + "epoch": 0.40915733073550903, + "grad_norm": 0.4080568253993988, + "learning_rate": 0.0002, + "loss": 1.8095, + "step": 420 + }, + { + "epoch": 0.4188991719434973, + "grad_norm": 0.38034746050834656, + "learning_rate": 0.0002, + "loss": 1.8555, + "step": 430 + }, + { + "epoch": 0.4286410131514856, + "grad_norm": 0.34893402457237244, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 440 + }, + { + "epoch": 0.43838285435947394, + "grad_norm": 0.33285608887672424, + "learning_rate": 0.0002, + "loss": 1.8481, + "step": 450 + }, + { + "epoch": 0.44812469556746226, + "grad_norm": 0.4110095798969269, + "learning_rate": 0.0002, + "loss": 1.8466, + "step": 460 + }, + { + "epoch": 0.4578665367754506, + "grad_norm": 0.3658817410469055, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 470 + }, + { + "epoch": 0.46760837798343885, + "grad_norm": 0.31350770592689514, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 480 + }, + { + "epoch": 0.47735021919142717, + "grad_norm": 0.38827991485595703, + "learning_rate": 0.0002, + "loss": 1.7839, + "step": 490 + }, + { + "epoch": 0.4870920603994155, + "grad_norm": 0.3792393207550049, + "learning_rate": 0.0002, + "loss": 1.8224, + "step": 500 + }, + { + "epoch": 0.4968339016074038, + "grad_norm": 0.3004095256328583, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 510 + }, + { + "epoch": 0.5065757428153921, + "grad_norm": 0.3200063407421112, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 520 + }, + { + "epoch": 0.5163175840233805, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 530 + }, + { + "epoch": 0.5260594252313687, + "grad_norm": 0.30258631706237793, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 540 + }, + { + "epoch": 0.535801266439357, + "grad_norm": 0.28210392594337463, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 550 + }, + { + "epoch": 0.5455431076473454, + "grad_norm": 0.34854066371917725, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 560 + }, + { + "epoch": 0.5552849488553336, + "grad_norm": 0.31689873337745667, + "learning_rate": 0.0002, + "loss": 1.8331, + "step": 570 + }, + { + "epoch": 0.565026790063322, + "grad_norm": 0.31253790855407715, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 580 + }, + { + "epoch": 0.5747686312713103, + "grad_norm": 0.3229721188545227, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 590 + }, + { + "epoch": 0.5845104724792985, + "grad_norm": 0.3723772466182709, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 600 + }, + { + "epoch": 0.5942523136872869, + "grad_norm": 0.345798522233963, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 610 + }, + { + "epoch": 0.6039941548952752, + "grad_norm": 0.3440598249435425, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 620 + }, + { + "epoch": 0.6137359961032636, + "grad_norm": 0.3406416177749634, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 630 + }, + { + "epoch": 0.6234778373112518, + "grad_norm": 0.3218357264995575, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 640 + }, + { + "epoch": 0.6332196785192401, + "grad_norm": 0.45319172739982605, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 650 + }, + { + "epoch": 0.6429615197272285, + "grad_norm": 0.2787110507488251, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 660 + }, + { + "epoch": 0.6527033609352167, + "grad_norm": 0.3064707815647125, + "learning_rate": 0.0002, + "loss": 1.8426, + "step": 670 + }, + { + "epoch": 0.662445202143205, + "grad_norm": 0.2940629720687866, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 680 + }, + { + "epoch": 0.6721870433511934, + "grad_norm": 0.31695225834846497, + "learning_rate": 0.0002, + "loss": 1.7865, + "step": 690 + }, + { + "epoch": 0.6819288845591817, + "grad_norm": 0.29589611291885376, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 700 + }, + { + "epoch": 0.69167072576717, + "grad_norm": 0.3062121570110321, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 710 + }, + { + "epoch": 0.7014125669751583, + "grad_norm": 0.3315656781196594, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 720 + }, + { + "epoch": 0.7111544081831466, + "grad_norm": 0.30353930592536926, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 730 + }, + { + "epoch": 0.720896249391135, + "grad_norm": 0.28360483050346375, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 0.7306380905991232, + "grad_norm": 0.3362562656402588, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 750 + }, + { + "epoch": 0.7403799318071116, + "grad_norm": 0.40434667468070984, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 760 + }, + { + "epoch": 0.7501217730150999, + "grad_norm": 0.2930425703525543, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 770 + }, + { + "epoch": 0.7598636142230881, + "grad_norm": 0.30177003145217896, + "learning_rate": 0.0002, + "loss": 1.8216, + "step": 780 + }, + { + "epoch": 0.7696054554310765, + "grad_norm": 0.2784474790096283, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 790 + }, + { + "epoch": 0.7793472966390648, + "grad_norm": 0.35849854350090027, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 800 + }, + { + "epoch": 0.7890891378470531, + "grad_norm": 0.27329114079475403, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 810 + }, + { + "epoch": 0.7988309790550414, + "grad_norm": 0.33331671357154846, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 820 + }, + { + "epoch": 0.8085728202630297, + "grad_norm": 0.28727295994758606, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 830 + }, + { + "epoch": 0.8183146614710181, + "grad_norm": 0.31391268968582153, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 840 + }, + { + "epoch": 0.8280565026790063, + "grad_norm": 0.3303709030151367, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 850 + }, + { + "epoch": 0.8377983438869946, + "grad_norm": 0.33772537112236023, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 860 + }, + { + "epoch": 0.847540185094983, + "grad_norm": 0.32876333594322205, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 870 + }, + { + "epoch": 0.8572820263029712, + "grad_norm": 0.28444716334342957, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 880 + }, + { + "epoch": 0.8670238675109596, + "grad_norm": 0.3070019483566284, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 890 + }, + { + "epoch": 0.8767657087189479, + "grad_norm": 0.29484760761260986, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 900 + }, + { + "epoch": 0.8865075499269361, + "grad_norm": 0.32373034954071045, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 910 + }, + { + "epoch": 0.8962493911349245, + "grad_norm": 0.3229396939277649, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 920 + }, + { + "epoch": 0.9059912323429128, + "grad_norm": 0.33151453733444214, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 930 + }, + { + "epoch": 0.9157330735509012, + "grad_norm": 0.32037460803985596, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 940 + }, + { + "epoch": 0.9254749147588894, + "grad_norm": 0.31283533573150635, + "learning_rate": 0.0002, + "loss": 1.822, + "step": 950 + }, + { + "epoch": 0.9352167559668777, + "grad_norm": 0.27984118461608887, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 960 + }, + { + "epoch": 0.9449585971748661, + "grad_norm": 0.316500186920166, + "learning_rate": 0.0002, + "loss": 1.7755, + "step": 970 + }, + { + "epoch": 0.9547004383828543, + "grad_norm": 0.33708682656288147, + "learning_rate": 0.0002, + "loss": 1.8032, + "step": 980 + }, + { + "epoch": 0.9644422795908427, + "grad_norm": 0.31026017665863037, + "learning_rate": 0.0002, + "loss": 1.8863, + "step": 990 + }, + { + "epoch": 0.974184120798831, + "grad_norm": 0.30874672532081604, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 1000 + }, + { + "epoch": 0.9839259620068193, + "grad_norm": 0.3257741630077362, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1010 + }, + { + "epoch": 0.9936678032148076, + "grad_norm": 0.2865653932094574, + "learning_rate": 0.0002, + "loss": 1.7936, + "step": 1020 + }, + { + "epoch": 0.9995129079396006, + "eval_loss": 1.8103164434432983, + "eval_runtime": 56.3917, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 1.135, + "step": 1026 + }, + { + "epoch": 1.003409644422796, + "grad_norm": 0.2860608398914337, + "learning_rate": 0.0002, + "loss": 1.7013, + "step": 1030 + }, + { + "epoch": 1.0131514856307842, + "grad_norm": 0.3156210780143738, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 1040 + }, + { + "epoch": 1.0228933268387725, + "grad_norm": 0.26126575469970703, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 1050 + }, + { + "epoch": 1.032635168046761, + "grad_norm": 0.3019633889198303, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 1060 + }, + { + "epoch": 1.042377009254749, + "grad_norm": 0.534140944480896, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1070 + }, + { + "epoch": 1.0521188504627375, + "grad_norm": 0.311872661113739, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 1080 + }, + { + "epoch": 1.0618606916707258, + "grad_norm": 0.3276001513004303, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 1090 + }, + { + "epoch": 1.071602532878714, + "grad_norm": 0.35227468609809875, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1100 + }, + { + "epoch": 1.0813443740867024, + "grad_norm": 0.3597564995288849, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 1110 + }, + { + "epoch": 1.0910862152946907, + "grad_norm": 0.3547225296497345, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 1120 + }, + { + "epoch": 1.100828056502679, + "grad_norm": 0.3399786353111267, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1130 + }, + { + "epoch": 1.1105698977106673, + "grad_norm": 0.3309086263179779, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 1140 + }, + { + "epoch": 1.1203117389186557, + "grad_norm": 0.39330706000328064, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1150 + }, + { + "epoch": 1.130053580126644, + "grad_norm": 0.3628021776676178, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1160 + }, + { + "epoch": 1.1397954213346322, + "grad_norm": 0.32995012402534485, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 1170 + }, + { + "epoch": 1.1495372625426206, + "grad_norm": 0.36292821168899536, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1180 + }, + { + "epoch": 1.159279103750609, + "grad_norm": 0.3470092713832855, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 1190 + }, + { + "epoch": 1.169020944958597, + "grad_norm": 0.3496156334877014, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1200 + }, + { + "epoch": 1.1787627861665855, + "grad_norm": 0.3442084789276123, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1210 + }, + { + "epoch": 1.1885046273745739, + "grad_norm": 0.34983909130096436, + "learning_rate": 0.0002, + "loss": 1.7763, + "step": 1220 + }, + { + "epoch": 1.198246468582562, + "grad_norm": 0.36505937576293945, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1230 + }, + { + "epoch": 1.2079883097905504, + "grad_norm": 0.31624770164489746, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1240 + }, + { + "epoch": 1.2177301509985388, + "grad_norm": 0.3528020679950714, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1250 + }, + { + "epoch": 1.2274719922065271, + "grad_norm": 0.29294025897979736, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1260 + }, + { + "epoch": 1.2372138334145153, + "grad_norm": 0.35048434138298035, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1270 + }, + { + "epoch": 1.2469556746225037, + "grad_norm": 0.35224461555480957, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1280 + }, + { + "epoch": 1.256697515830492, + "grad_norm": 0.4041554629802704, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1290 + }, + { + "epoch": 1.2664393570384802, + "grad_norm": 0.3447791039943695, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1300 + }, + { + "epoch": 1.2761811982464686, + "grad_norm": 0.3315333425998688, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1310 + }, + { + "epoch": 1.285923039454457, + "grad_norm": 0.3587741255760193, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 1320 + }, + { + "epoch": 1.2956648806624451, + "grad_norm": 0.3704394996166229, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1330 + }, + { + "epoch": 1.3054067218704335, + "grad_norm": 0.38131803274154663, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1340 + }, + { + "epoch": 1.3151485630784219, + "grad_norm": 0.36109617352485657, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 1350 + }, + { + "epoch": 1.32489040428641, + "grad_norm": 0.37283554673194885, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 1360 + }, + { + "epoch": 1.3346322454943984, + "grad_norm": 0.31808891892433167, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1370 + }, + { + "epoch": 1.3443740867023868, + "grad_norm": 0.3370385766029358, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1380 + }, + { + "epoch": 1.354115927910375, + "grad_norm": 0.3568558394908905, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1390 + }, + { + "epoch": 1.3638577691183633, + "grad_norm": 0.3537410497665405, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1400 + }, + { + "epoch": 1.3735996103263517, + "grad_norm": 0.3536544144153595, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 1410 + }, + { + "epoch": 1.38334145153434, + "grad_norm": 0.3772895038127899, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1420 + }, + { + "epoch": 1.3930832927423282, + "grad_norm": 0.38079720735549927, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1430 + }, + { + "epoch": 1.4028251339503166, + "grad_norm": 0.3811109662055969, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1440 + }, + { + "epoch": 1.412566975158305, + "grad_norm": 0.38586318492889404, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 1450 + }, + { + "epoch": 1.4223088163662934, + "grad_norm": 0.3405744135379791, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1460 + }, + { + "epoch": 1.4320506575742815, + "grad_norm": 0.39527642726898193, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1470 + }, + { + "epoch": 1.44179249878227, + "grad_norm": 0.4494728744029999, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 1480 + }, + { + "epoch": 1.4515343399902583, + "grad_norm": 0.34068453311920166, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1490 + }, + { + "epoch": 1.4612761811982464, + "grad_norm": 0.36169710755348206, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 1500 + }, + { + "epoch": 1.4710180224062348, + "grad_norm": 0.31519418954849243, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 1510 + }, + { + "epoch": 1.4807598636142232, + "grad_norm": 0.35117292404174805, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.4905017048222113, + "grad_norm": 0.40951141715049744, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1530 + }, + { + "epoch": 1.5002435460301997, + "grad_norm": 0.37542906403541565, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 1540 + }, + { + "epoch": 1.509985387238188, + "grad_norm": 0.35395753383636475, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1550 + }, + { + "epoch": 1.5197272284461762, + "grad_norm": 0.35497018694877625, + "learning_rate": 0.0002, + "loss": 1.6517, + "step": 1560 + }, + { + "epoch": 1.5294690696541646, + "grad_norm": 0.3693031072616577, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 1570 + }, + { + "epoch": 1.539210910862153, + "grad_norm": 0.34013301134109497, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1580 + }, + { + "epoch": 1.5489527520701412, + "grad_norm": 0.37312784790992737, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1590 + }, + { + "epoch": 1.5586945932781295, + "grad_norm": 0.357496440410614, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1600 + }, + { + "epoch": 1.568436434486118, + "grad_norm": 0.35192370414733887, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1610 + }, + { + "epoch": 1.578178275694106, + "grad_norm": 0.34144821763038635, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 1620 + }, + { + "epoch": 1.5879201169020944, + "grad_norm": 0.3320509195327759, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 1630 + }, + { + "epoch": 1.5976619581100828, + "grad_norm": 0.34178847074508667, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1640 + }, + { + "epoch": 1.607403799318071, + "grad_norm": 0.36567580699920654, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 1650 + }, + { + "epoch": 1.6171456405260596, + "grad_norm": 0.35599812865257263, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1660 + }, + { + "epoch": 1.6268874817340477, + "grad_norm": 0.33765384554862976, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1670 + }, + { + "epoch": 1.636629322942036, + "grad_norm": 0.33142679929733276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1680 + }, + { + "epoch": 1.6463711641500245, + "grad_norm": 0.6959079504013062, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1690 + }, + { + "epoch": 1.6561130053580126, + "grad_norm": 0.35073819756507874, + "learning_rate": 0.0002, + "loss": 1.7665, + "step": 1700 + }, + { + "epoch": 1.665854846566001, + "grad_norm": 0.3461478352546692, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1710 + }, + { + "epoch": 1.6755966877739894, + "grad_norm": 0.3697752058506012, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 1720 + }, + { + "epoch": 1.6853385289819776, + "grad_norm": 0.3755154609680176, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1730 + }, + { + "epoch": 1.695080370189966, + "grad_norm": 0.33977627754211426, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 1740 + }, + { + "epoch": 1.7048222113979543, + "grad_norm": 0.4001041650772095, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1750 + }, + { + "epoch": 1.7145640526059425, + "grad_norm": 0.36998286843299866, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 1760 + }, + { + "epoch": 1.7243058938139308, + "grad_norm": 0.39944565296173096, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1770 + }, + { + "epoch": 1.7340477350219192, + "grad_norm": 0.4002859890460968, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1780 + }, + { + "epoch": 1.7437895762299074, + "grad_norm": 0.33336859941482544, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 1790 + }, + { + "epoch": 1.7535314174378958, + "grad_norm": 0.35853952169418335, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 1800 + }, + { + "epoch": 1.7632732586458841, + "grad_norm": 0.35876700282096863, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1810 + }, + { + "epoch": 1.7730150998538723, + "grad_norm": 0.3497968912124634, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 1820 + }, + { + "epoch": 1.7827569410618607, + "grad_norm": 0.33182016015052795, + "learning_rate": 0.0002, + "loss": 1.7128, + "step": 1830 + }, + { + "epoch": 1.792498782269849, + "grad_norm": 0.33359771966934204, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1840 + }, + { + "epoch": 1.8022406234778372, + "grad_norm": 0.38070961833000183, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 1850 + }, + { + "epoch": 1.8119824646858256, + "grad_norm": 0.34111160039901733, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 1860 + }, + { + "epoch": 1.821724305893814, + "grad_norm": 0.4439302980899811, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1870 + }, + { + "epoch": 1.8314661471018021, + "grad_norm": 0.37065210938453674, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 1880 + }, + { + "epoch": 1.8412079883097907, + "grad_norm": 0.33630406856536865, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 1890 + }, + { + "epoch": 1.8509498295177789, + "grad_norm": 0.334553986787796, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 1900 + }, + { + "epoch": 1.860691670725767, + "grad_norm": 0.3603808879852295, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1910 + }, + { + "epoch": 1.8704335119337556, + "grad_norm": 0.4307343363761902, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1920 + }, + { + "epoch": 1.8801753531417438, + "grad_norm": 0.455602765083313, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1930 + }, + { + "epoch": 1.8899171943497322, + "grad_norm": 0.35242316126823425, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 1940 + }, + { + "epoch": 1.8996590355577205, + "grad_norm": 0.3589116632938385, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1950 + }, + { + "epoch": 1.9094008767657087, + "grad_norm": 0.3540741801261902, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1960 + }, + { + "epoch": 1.919142717973697, + "grad_norm": 0.3547612428665161, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 1970 + }, + { + "epoch": 1.9288845591816854, + "grad_norm": 0.3485773503780365, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1980 + }, + { + "epoch": 1.9386264003896736, + "grad_norm": 0.3560304641723633, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1990 + }, + { + "epoch": 1.948368241597662, + "grad_norm": 0.33299335837364197, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 2000 + }, + { + "epoch": 1.9581100828056504, + "grad_norm": 0.35622233152389526, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2010 + }, + { + "epoch": 1.9678519240136385, + "grad_norm": 0.3681301474571228, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 2020 + }, + { + "epoch": 1.9775937652216269, + "grad_norm": 0.36158084869384766, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2030 + }, + { + "epoch": 1.9873356064296153, + "grad_norm": 0.32560569047927856, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 2040 + }, + { + "epoch": 1.9970774476376034, + "grad_norm": 0.37404149770736694, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 2050 + }, + { + "epoch": 2.0, + "eval_loss": 1.8119343519210815, + "eval_runtime": 96.0045, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 0.667, + "step": 2053 + }, + { + "epoch": 2.006819288845592, + "grad_norm": 0.374188631772995, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 2060 + }, + { + "epoch": 2.01656113005358, + "grad_norm": 0.421764075756073, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2070 + }, + { + "epoch": 2.0263029712615683, + "grad_norm": 0.43841829895973206, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2080 + }, + { + "epoch": 2.036044812469557, + "grad_norm": 0.42298218607902527, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 2090 + }, + { + "epoch": 2.045786653677545, + "grad_norm": 0.43669602274894714, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 2100 + }, + { + "epoch": 2.0555284948855332, + "grad_norm": 0.4080469012260437, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 2110 + }, + { + "epoch": 2.065270336093522, + "grad_norm": 0.483192503452301, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2120 + }, + { + "epoch": 2.07501217730151, + "grad_norm": 0.44427400827407837, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 2130 + }, + { + "epoch": 2.084754018509498, + "grad_norm": 0.48835131525993347, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 2140 + }, + { + "epoch": 2.0944958597174868, + "grad_norm": 0.42733684182167053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2150 + }, + { + "epoch": 2.104237700925475, + "grad_norm": 0.4258694648742676, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 2160 + }, + { + "epoch": 2.113979542133463, + "grad_norm": 0.5164985656738281, + "learning_rate": 0.0002, + "loss": 1.517, + "step": 2170 + }, + { + "epoch": 2.1237213833414517, + "grad_norm": 0.4279228150844574, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 2180 + }, + { + "epoch": 2.13346322454944, + "grad_norm": 0.48209506273269653, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2190 + }, + { + "epoch": 2.143205065757428, + "grad_norm": 0.4071785509586334, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 2200 + }, + { + "epoch": 2.1529469069654166, + "grad_norm": 0.4629398584365845, + "learning_rate": 0.0002, + "loss": 1.5469, + "step": 2210 + }, + { + "epoch": 2.1626887481734047, + "grad_norm": 0.44390997290611267, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2220 + }, + { + "epoch": 2.172430589381393, + "grad_norm": 0.46886971592903137, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 2230 + }, + { + "epoch": 2.1821724305893815, + "grad_norm": 0.43745434284210205, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 2240 + }, + { + "epoch": 2.1919142717973696, + "grad_norm": 0.42737245559692383, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 2250 + }, + { + "epoch": 2.201656113005358, + "grad_norm": 0.5028428435325623, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 2260 + }, + { + "epoch": 2.2113979542133464, + "grad_norm": 0.48987212777137756, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 2270 + }, + { + "epoch": 2.2211397954213346, + "grad_norm": 0.48186370730400085, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 2280 + }, + { + "epoch": 2.2308816366293227, + "grad_norm": 0.4417429566383362, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 2290 + }, + { + "epoch": 2.2406234778373113, + "grad_norm": 0.4757710099220276, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2300 + }, + { + "epoch": 2.2503653190452995, + "grad_norm": 0.44449448585510254, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2310 + }, + { + "epoch": 2.260107160253288, + "grad_norm": 0.5070863962173462, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 2320 + }, + { + "epoch": 2.269849001461276, + "grad_norm": 0.4967133700847626, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2330 + }, + { + "epoch": 2.2795908426692644, + "grad_norm": 0.5110220909118652, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2340 + }, + { + "epoch": 2.289332683877253, + "grad_norm": 0.47984135150909424, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2350 + }, + { + "epoch": 2.299074525085241, + "grad_norm": 0.5005794763565063, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 2360 + }, + { + "epoch": 2.3088163662932293, + "grad_norm": 0.4991425573825836, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 2370 + }, + { + "epoch": 2.318558207501218, + "grad_norm": 0.4948616623878479, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2380 + }, + { + "epoch": 2.328300048709206, + "grad_norm": 0.4533160328865051, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2390 + }, + { + "epoch": 2.338041889917194, + "grad_norm": 0.5871071219444275, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 2400 + }, + { + "epoch": 2.347783731125183, + "grad_norm": 0.5048075914382935, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2410 + }, + { + "epoch": 2.357525572333171, + "grad_norm": 0.4973750412464142, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 2420 + }, + { + "epoch": 2.367267413541159, + "grad_norm": 0.48294538259506226, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 2430 + }, + { + "epoch": 2.3770092547491477, + "grad_norm": 0.7180454134941101, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 2440 + }, + { + "epoch": 2.386751095957136, + "grad_norm": 0.4627632796764374, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2450 + }, + { + "epoch": 2.396492937165124, + "grad_norm": 0.4834378957748413, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 2460 + }, + { + "epoch": 2.4062347783731126, + "grad_norm": 0.5173670649528503, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 2470 + }, + { + "epoch": 2.4159766195811008, + "grad_norm": 0.49652737379074097, + "learning_rate": 0.0002, + "loss": 1.5464, + "step": 2480 + }, + { + "epoch": 2.4257184607890894, + "grad_norm": 0.47052669525146484, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 2490 + }, + { + "epoch": 2.4354603019970775, + "grad_norm": 0.5188006162643433, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 2500 + }, + { + "epoch": 2.4452021432050657, + "grad_norm": 0.5010119676589966, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 2510 + }, + { + "epoch": 2.4549439844130543, + "grad_norm": 0.4765235483646393, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2520 + }, + { + "epoch": 2.4646858256210424, + "grad_norm": 0.5292699337005615, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2530 + }, + { + "epoch": 2.4744276668290306, + "grad_norm": 0.48555099964141846, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 2540 + }, + { + "epoch": 2.484169508037019, + "grad_norm": 0.4764043092727661, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 2550 + }, + { + "epoch": 2.4939113492450073, + "grad_norm": 0.47839659452438354, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 2560 + }, + { + "epoch": 2.5036531904529955, + "grad_norm": 0.4514436721801758, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 2570 + }, + { + "epoch": 2.513395031660984, + "grad_norm": 0.5681955218315125, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2580 + }, + { + "epoch": 2.5231368728689723, + "grad_norm": 0.49655985832214355, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 2590 + }, + { + "epoch": 2.5328787140769604, + "grad_norm": 0.5077657103538513, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2600 + }, + { + "epoch": 2.542620555284949, + "grad_norm": 0.5643279552459717, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 2610 + }, + { + "epoch": 2.552362396492937, + "grad_norm": 0.4715031087398529, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 2620 + }, + { + "epoch": 2.5621042377009253, + "grad_norm": 0.528400719165802, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2630 + }, + { + "epoch": 2.571846078908914, + "grad_norm": 0.49469611048698425, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2640 + }, + { + "epoch": 2.581587920116902, + "grad_norm": 0.4567806124687195, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 2650 + }, + { + "epoch": 2.5913297613248902, + "grad_norm": 0.5357107520103455, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2660 + }, + { + "epoch": 2.601071602532879, + "grad_norm": 0.46977677941322327, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 2670 + }, + { + "epoch": 2.610813443740867, + "grad_norm": 0.6626771092414856, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 2680 + }, + { + "epoch": 2.620555284948855, + "grad_norm": 0.4587472081184387, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2690 + }, + { + "epoch": 2.6302971261568437, + "grad_norm": 0.4816797077655792, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2700 + }, + { + "epoch": 2.640038967364832, + "grad_norm": 0.4856809675693512, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.64978080857282, + "grad_norm": 0.46010780334472656, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2720 + }, + { + "epoch": 2.6595226497808087, + "grad_norm": 0.4637954533100128, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 2730 + }, + { + "epoch": 2.669264490988797, + "grad_norm": 0.5954997539520264, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 2740 + }, + { + "epoch": 2.679006332196785, + "grad_norm": 0.5071861743927002, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2750 + }, + { + "epoch": 2.6887481734047736, + "grad_norm": 0.5415477156639099, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.6984900146127617, + "grad_norm": 0.5618549585342407, + "learning_rate": 0.0002, + "loss": 1.5476, + "step": 2770 + }, + { + "epoch": 2.70823185582075, + "grad_norm": 0.49338817596435547, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 2780 + }, + { + "epoch": 2.7179736970287385, + "grad_norm": 0.5149586796760559, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2790 + }, + { + "epoch": 2.7277155382367266, + "grad_norm": 0.6247242093086243, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2800 + }, + { + "epoch": 2.737457379444715, + "grad_norm": 0.4749542474746704, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2810 + }, + { + "epoch": 2.7471992206527034, + "grad_norm": 0.4979191720485687, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 2820 + }, + { + "epoch": 2.7569410618606915, + "grad_norm": 0.4885074198246002, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2830 + }, + { + "epoch": 2.76668290306868, + "grad_norm": 0.5047747492790222, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2840 + }, + { + "epoch": 2.7764247442766683, + "grad_norm": 0.5280140042304993, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2850 + }, + { + "epoch": 2.7861665854846565, + "grad_norm": 0.477668434381485, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 2860 + }, + { + "epoch": 2.795908426692645, + "grad_norm": 0.4816327393054962, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2870 + }, + { + "epoch": 2.805650267900633, + "grad_norm": 0.523259162902832, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2880 + }, + { + "epoch": 2.8153921091086214, + "grad_norm": 0.5045270919799805, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2890 + }, + { + "epoch": 2.82513395031661, + "grad_norm": 0.47986042499542236, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2900 + }, + { + "epoch": 2.834875791524598, + "grad_norm": 0.4858797490596771, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2910 + }, + { + "epoch": 2.8446176327325867, + "grad_norm": 0.5261512398719788, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 2920 + }, + { + "epoch": 2.854359473940575, + "grad_norm": 0.630550742149353, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2930 + }, + { + "epoch": 2.864101315148563, + "grad_norm": 0.49119752645492554, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2940 + }, + { + "epoch": 2.8738431563565516, + "grad_norm": 0.4779070317745209, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2950 + }, + { + "epoch": 2.88358499756454, + "grad_norm": 0.5059782266616821, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 2960 + }, + { + "epoch": 2.893326838772528, + "grad_norm": 0.5466655492782593, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2970 + }, + { + "epoch": 2.9030686799805165, + "grad_norm": 0.4865640103816986, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 2980 + }, + { + "epoch": 2.9128105211885047, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 2990 + }, + { + "epoch": 2.922552362396493, + "grad_norm": 0.9112305641174316, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 3000 + }, + { + "epoch": 2.9322942036044815, + "grad_norm": 0.3938814103603363, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 3010 + }, + { + "epoch": 2.9420360448124696, + "grad_norm": 0.5500800609588623, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 3020 + }, + { + "epoch": 2.9517778860204578, + "grad_norm": 0.5346390604972839, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3030 + }, + { + "epoch": 2.9615197272284464, + "grad_norm": 0.5245014429092407, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3040 + }, + { + "epoch": 2.9712615684364345, + "grad_norm": 0.4906884431838989, + "learning_rate": 0.0002, + "loss": 1.5549, + "step": 3050 + }, + { + "epoch": 2.9810034096444227, + "grad_norm": 0.47086769342422485, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3060 + }, + { + "epoch": 2.9907452508524113, + "grad_norm": 0.5290229320526123, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3070 + }, + { + "epoch": 2.9995129079396006, + "eval_loss": 1.8463934659957886, + "eval_runtime": 56.2401, + "eval_samples_per_second": 9.015, + "eval_steps_per_second": 1.138, + "step": 3079 + }, + { + "epoch": 3.0004870920603994, + "grad_norm": 0.49992576241493225, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 3080 + }, + { + "epoch": 3.0102289332683876, + "grad_norm": 0.8242783546447754, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3090 + }, + { + "epoch": 3.019970774476376, + "grad_norm": 0.6330569386482239, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3100 + }, + { + "epoch": 3.0297126156843643, + "grad_norm": 0.566097617149353, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 3110 + }, + { + "epoch": 3.0394544568923525, + "grad_norm": 0.6337586045265198, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 3120 + }, + { + "epoch": 3.049196298100341, + "grad_norm": 0.7339403033256531, + "learning_rate": 0.0002, + "loss": 1.3916, + "step": 3130 + }, + { + "epoch": 3.0589381393083293, + "grad_norm": 0.7187346816062927, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 3140 + }, + { + "epoch": 3.0686799805163174, + "grad_norm": 0.7116255760192871, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3150 + }, + { + "epoch": 3.078421821724306, + "grad_norm": 0.6493807435035706, + "learning_rate": 0.0002, + "loss": 1.4452, + "step": 3160 + }, + { + "epoch": 3.088163662932294, + "grad_norm": 0.6777266263961792, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 3170 + }, + { + "epoch": 3.0979055041402823, + "grad_norm": 0.6342006325721741, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 3180 + }, + { + "epoch": 3.107647345348271, + "grad_norm": 0.6608964204788208, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 3190 + }, + { + "epoch": 3.117389186556259, + "grad_norm": 0.7230247259140015, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 3200 + }, + { + "epoch": 3.1271310277642472, + "grad_norm": 0.650368332862854, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 3210 + }, + { + "epoch": 3.136872868972236, + "grad_norm": 0.7319342494010925, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 3220 + }, + { + "epoch": 3.146614710180224, + "grad_norm": 0.7159963846206665, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 3230 + }, + { + "epoch": 3.156356551388212, + "grad_norm": 0.8905230164527893, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 3240 + }, + { + "epoch": 3.1660983925962007, + "grad_norm": 0.6920804381370544, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 3250 + }, + { + "epoch": 3.175840233804189, + "grad_norm": 0.6782063841819763, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 3260 + }, + { + "epoch": 3.1855820750121775, + "grad_norm": 0.735325276851654, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3270 + }, + { + "epoch": 3.1953239162201656, + "grad_norm": 0.6657978296279907, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 3280 + }, + { + "epoch": 3.205065757428154, + "grad_norm": 0.771315336227417, + "learning_rate": 0.0002, + "loss": 1.3456, + "step": 3290 + }, + { + "epoch": 3.2148075986361424, + "grad_norm": 0.6492983102798462, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 3300 + }, + { + "epoch": 3.2245494398441306, + "grad_norm": 0.7513770461082458, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3310 + }, + { + "epoch": 3.2342912810521187, + "grad_norm": 0.7091423869132996, + "learning_rate": 0.0002, + "loss": 1.4032, + "step": 3320 + }, + { + "epoch": 3.2440331222601073, + "grad_norm": 0.6663975119590759, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 3330 + }, + { + "epoch": 3.2537749634680955, + "grad_norm": 0.6813122034072876, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 3340 + }, + { + "epoch": 3.2635168046760836, + "grad_norm": 0.6602569818496704, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 3350 + }, + { + "epoch": 3.2732586458840722, + "grad_norm": 0.718270480632782, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 3360 + }, + { + "epoch": 3.2830004870920604, + "grad_norm": 0.6884173154830933, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 3370 + }, + { + "epoch": 3.2927423283000485, + "grad_norm": 0.7039775848388672, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3380 + }, + { + "epoch": 3.302484169508037, + "grad_norm": 0.7444299459457397, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3390 + }, + { + "epoch": 3.3122260107160253, + "grad_norm": 0.7187064290046692, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 3400 + }, + { + "epoch": 3.3219678519240134, + "grad_norm": 0.599396288394928, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3410 + }, + { + "epoch": 3.331709693132002, + "grad_norm": 0.7670390009880066, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3420 + }, + { + "epoch": 3.34145153433999, + "grad_norm": 0.6654478311538696, + "learning_rate": 0.0002, + "loss": 1.4411, + "step": 3430 + }, + { + "epoch": 3.351193375547979, + "grad_norm": 0.6644385457038879, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 3440 + }, + { + "epoch": 3.360935216755967, + "grad_norm": 0.6974098086357117, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3450 + }, + { + "epoch": 3.370677057963955, + "grad_norm": 0.7350399494171143, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 3460 + }, + { + "epoch": 3.3804188991719437, + "grad_norm": 0.714721143245697, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 3470 + }, + { + "epoch": 3.390160740379932, + "grad_norm": 0.7006027698516846, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 3480 + }, + { + "epoch": 3.39990258158792, + "grad_norm": 0.6767925024032593, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3490 + }, + { + "epoch": 3.4096444227959086, + "grad_norm": 0.6721355319023132, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 3500 + }, + { + "epoch": 3.419386264003897, + "grad_norm": 0.6845725178718567, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3510 + }, + { + "epoch": 3.429128105211885, + "grad_norm": 0.6882196664810181, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 3520 + }, + { + "epoch": 3.4388699464198735, + "grad_norm": 0.7663240432739258, + "learning_rate": 0.0002, + "loss": 1.4962, + "step": 3530 + }, + { + "epoch": 3.4486117876278617, + "grad_norm": 0.6304219365119934, + "learning_rate": 0.0002, + "loss": 1.4644, + "step": 3540 + }, + { + "epoch": 3.45835362883585, + "grad_norm": 0.668678879737854, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3550 + }, + { + "epoch": 3.4680954700438384, + "grad_norm": 0.7526912093162537, + "learning_rate": 0.0002, + "loss": 1.4874, + "step": 3560 + }, + { + "epoch": 3.4778373112518266, + "grad_norm": 1.089495301246643, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 3570 + }, + { + "epoch": 3.4875791524598148, + "grad_norm": 0.7282902002334595, + "learning_rate": 0.0002, + "loss": 1.3871, + "step": 3580 + }, + { + "epoch": 3.4973209936678034, + "grad_norm": 0.6540156602859497, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3590 + }, + { + "epoch": 3.5070628348757915, + "grad_norm": 0.6449568867683411, + "learning_rate": 0.0002, + "loss": 1.4367, + "step": 3600 + }, + { + "epoch": 3.5168046760837797, + "grad_norm": 0.7262216210365295, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 3610 + }, + { + "epoch": 3.5265465172917683, + "grad_norm": 0.6048615574836731, + "learning_rate": 0.0002, + "loss": 1.4374, + "step": 3620 + }, + { + "epoch": 3.5362883584997564, + "grad_norm": 0.6780537366867065, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 3630 + }, + { + "epoch": 3.5460301997077446, + "grad_norm": 0.6851925253868103, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 3640 + }, + { + "epoch": 3.555772040915733, + "grad_norm": 0.6530634164810181, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3650 + }, + { + "epoch": 3.5655138821237213, + "grad_norm": 0.7193992733955383, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3660 + }, + { + "epoch": 3.5752557233317095, + "grad_norm": 0.767496645450592, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 3670 + }, + { + "epoch": 3.584997564539698, + "grad_norm": 0.6912919282913208, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 3680 + }, + { + "epoch": 3.5947394057476862, + "grad_norm": 0.7383436560630798, + "learning_rate": 0.0002, + "loss": 1.4497, + "step": 3690 + }, + { + "epoch": 3.6044812469556744, + "grad_norm": 0.6746662855148315, + "learning_rate": 0.0002, + "loss": 1.4822, + "step": 3700 + }, + { + "epoch": 3.614223088163663, + "grad_norm": 0.6885138750076294, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3710 + }, + { + "epoch": 3.623964929371651, + "grad_norm": 0.6694392561912537, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3720 + }, + { + "epoch": 3.6337067705796393, + "grad_norm": 0.812358021736145, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 3730 + }, + { + "epoch": 3.643448611787628, + "grad_norm": 0.7267130017280579, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 3740 + }, + { + "epoch": 3.653190452995616, + "grad_norm": 0.6958749294281006, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3750 + }, + { + "epoch": 3.6629322942036042, + "grad_norm": 0.6805673241615295, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3760 + }, + { + "epoch": 3.672674135411593, + "grad_norm": 0.7184410095214844, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3770 + }, + { + "epoch": 3.682415976619581, + "grad_norm": 0.7716330289840698, + "learning_rate": 0.0002, + "loss": 1.3935, + "step": 3780 + }, + { + "epoch": 3.6921578178275696, + "grad_norm": 0.6675831079483032, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3790 + }, + { + "epoch": 3.7018996590355577, + "grad_norm": 0.6480095386505127, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 3800 + }, + { + "epoch": 3.711641500243546, + "grad_norm": 0.6559418439865112, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 3810 + }, + { + "epoch": 3.7213833414515345, + "grad_norm": 0.6596545577049255, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 3820 + }, + { + "epoch": 3.7311251826595226, + "grad_norm": 0.7172950506210327, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3830 + }, + { + "epoch": 3.740867023867511, + "grad_norm": 0.796148419380188, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 3840 + }, + { + "epoch": 3.7506088650754994, + "grad_norm": 0.6600322723388672, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 3850 + }, + { + "epoch": 3.7603507062834876, + "grad_norm": 0.6776387691497803, + "learning_rate": 0.0002, + "loss": 1.4201, + "step": 3860 + }, + { + "epoch": 3.770092547491476, + "grad_norm": 0.7768304347991943, + "learning_rate": 0.0002, + "loss": 1.3893, + "step": 3870 + }, + { + "epoch": 3.7798343886994643, + "grad_norm": 1.0579794645309448, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 3880 + }, + { + "epoch": 3.7895762299074525, + "grad_norm": 0.6757252812385559, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 3890 + }, + { + "epoch": 3.799318071115441, + "grad_norm": 0.6706996560096741, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3900 + }, + { + "epoch": 3.809059912323429, + "grad_norm": 0.7026948928833008, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 3910 + }, + { + "epoch": 3.8188017535314174, + "grad_norm": 0.6437768340110779, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 3920 + }, + { + "epoch": 3.828543594739406, + "grad_norm": 0.7015706300735474, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 3930 + }, + { + "epoch": 3.838285435947394, + "grad_norm": 0.7049482464790344, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 3940 + }, + { + "epoch": 3.8480272771553823, + "grad_norm": 0.6533724665641785, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3950 + }, + { + "epoch": 3.857769118363371, + "grad_norm": 0.7312499284744263, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 3960 + }, + { + "epoch": 3.867510959571359, + "grad_norm": 0.6858801245689392, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3970 + }, + { + "epoch": 3.877252800779347, + "grad_norm": 0.770423173904419, + "learning_rate": 0.0002, + "loss": 1.4423, + "step": 3980 + }, + { + "epoch": 3.886994641987336, + "grad_norm": 0.6987539529800415, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 3990 + }, + { + "epoch": 3.896736483195324, + "grad_norm": 0.7072722315788269, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 4000 + }, + { + "epoch": 3.906478324403312, + "grad_norm": 0.6492931842803955, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4010 + }, + { + "epoch": 3.9162201656113007, + "grad_norm": 0.7716232538223267, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 4020 + }, + { + "epoch": 3.925962006819289, + "grad_norm": 0.722949743270874, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 4030 + }, + { + "epoch": 3.935703848027277, + "grad_norm": 0.7434365749359131, + "learning_rate": 0.0002, + "loss": 1.3914, + "step": 4040 + }, + { + "epoch": 3.9454456892352656, + "grad_norm": 0.6691509485244751, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 4050 + }, + { + "epoch": 3.9551875304432538, + "grad_norm": 0.6850284337997437, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4060 + }, + { + "epoch": 3.964929371651242, + "grad_norm": 0.6954452991485596, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4070 + }, + { + "epoch": 3.9746712128592305, + "grad_norm": 0.9316364526748657, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 4080 + }, + { + "epoch": 3.9844130540672187, + "grad_norm": 0.6908289194107056, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 4090 + }, + { + "epoch": 3.994154895275207, + "grad_norm": 0.666782021522522, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 4100 + }, + { + "epoch": 4.0, + "eval_loss": 1.9233275651931763, + "eval_runtime": 55.9536, + "eval_samples_per_second": 9.061, + "eval_steps_per_second": 1.144, + "step": 4106 + }, + { + "epoch": 4.003896736483195, + "grad_norm": 0.7726166248321533, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 4110 + }, + { + "epoch": 4.013638577691184, + "grad_norm": 1.1338967084884644, + "learning_rate": 0.0002, + "loss": 1.1415, + "step": 4120 + }, + { + "epoch": 4.023380418899172, + "grad_norm": 0.9530029296875, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 4130 + }, + { + "epoch": 4.03312226010716, + "grad_norm": 1.1058554649353027, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 4140 + }, + { + "epoch": 4.042864101315149, + "grad_norm": 0.8765049576759338, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 4150 + }, + { + "epoch": 4.052605942523137, + "grad_norm": 1.1774667501449585, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4160 + }, + { + "epoch": 4.062347783731125, + "grad_norm": 0.9301433563232422, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 4170 + }, + { + "epoch": 4.072089624939114, + "grad_norm": 1.0196778774261475, + "learning_rate": 0.0002, + "loss": 1.1807, + "step": 4180 + }, + { + "epoch": 4.081831466147102, + "grad_norm": 1.1380577087402344, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 4190 + }, + { + "epoch": 4.09157330735509, + "grad_norm": 0.9121319651603699, + "learning_rate": 0.0002, + "loss": 1.2521, + "step": 4200 + }, + { + "epoch": 4.101315148563079, + "grad_norm": 0.9495378732681274, + "learning_rate": 0.0002, + "loss": 1.1747, + "step": 4210 + }, + { + "epoch": 4.1110569897710665, + "grad_norm": 0.8058680295944214, + "learning_rate": 0.0002, + "loss": 1.1829, + "step": 4220 + }, + { + "epoch": 4.120798830979055, + "grad_norm": 1.000887393951416, + "learning_rate": 0.0002, + "loss": 1.1732, + "step": 4230 + }, + { + "epoch": 4.130540672187044, + "grad_norm": 0.9529102444648743, + "learning_rate": 0.0002, + "loss": 1.1947, + "step": 4240 + }, + { + "epoch": 4.140282513395031, + "grad_norm": 1.0257115364074707, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 4250 + }, + { + "epoch": 4.15002435460302, + "grad_norm": 0.9590303897857666, + "learning_rate": 0.0002, + "loss": 1.2293, + "step": 4260 + }, + { + "epoch": 4.159766195811009, + "grad_norm": 1.065291166305542, + "learning_rate": 0.0002, + "loss": 1.1918, + "step": 4270 + }, + { + "epoch": 4.169508037018996, + "grad_norm": 0.8819697499275208, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 4280 + }, + { + "epoch": 4.179249878226985, + "grad_norm": 1.0335261821746826, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 4290 + }, + { + "epoch": 4.1889917194349735, + "grad_norm": 0.8872809410095215, + "learning_rate": 0.0002, + "loss": 1.2131, + "step": 4300 + }, + { + "epoch": 4.198733560642961, + "grad_norm": 0.9883159399032593, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 4310 + }, + { + "epoch": 4.20847540185095, + "grad_norm": 1.0254192352294922, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 4320 + }, + { + "epoch": 4.218217243058938, + "grad_norm": 0.9432600736618042, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 4330 + }, + { + "epoch": 4.227959084266926, + "grad_norm": 1.1008676290512085, + "learning_rate": 0.0002, + "loss": 1.2684, + "step": 4340 + }, + { + "epoch": 4.237700925474915, + "grad_norm": 1.0829699039459229, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 4350 + }, + { + "epoch": 4.247442766682903, + "grad_norm": 1.016847848892212, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 4360 + }, + { + "epoch": 4.257184607890891, + "grad_norm": 0.8924864530563354, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 4370 + }, + { + "epoch": 4.26692644909888, + "grad_norm": 0.9300530552864075, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 4380 + }, + { + "epoch": 4.276668290306868, + "grad_norm": 0.9684814810752869, + "learning_rate": 0.0002, + "loss": 1.1696, + "step": 4390 + }, + { + "epoch": 4.286410131514856, + "grad_norm": 0.9916250705718994, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 4400 + }, + { + "epoch": 4.2961519727228445, + "grad_norm": 0.903680145740509, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 4410 + }, + { + "epoch": 4.305893813930833, + "grad_norm": 0.8713505268096924, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 4420 + }, + { + "epoch": 4.315635655138821, + "grad_norm": 0.9983905553817749, + "learning_rate": 0.0002, + "loss": 1.1957, + "step": 4430 + }, + { + "epoch": 4.3253774963468095, + "grad_norm": 1.1689040660858154, + "learning_rate": 0.0002, + "loss": 1.2676, + "step": 4440 + }, + { + "epoch": 4.335119337554798, + "grad_norm": 0.9316853880882263, + "learning_rate": 0.0002, + "loss": 1.2166, + "step": 4450 + }, + { + "epoch": 4.344861178762786, + "grad_norm": 0.9175887107849121, + "learning_rate": 0.0002, + "loss": 1.222, + "step": 4460 + }, + { + "epoch": 4.354603019970774, + "grad_norm": 0.9348906874656677, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4470 + }, + { + "epoch": 4.364344861178763, + "grad_norm": 0.9727016687393188, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 4480 + }, + { + "epoch": 4.374086702386751, + "grad_norm": 0.9843429923057556, + "learning_rate": 0.0002, + "loss": 1.2616, + "step": 4490 + }, + { + "epoch": 4.383828543594739, + "grad_norm": 0.9615852236747742, + "learning_rate": 0.0002, + "loss": 1.2488, + "step": 4500 + }, + { + "epoch": 4.393570384802728, + "grad_norm": 0.9688583612442017, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 4510 + }, + { + "epoch": 4.403312226010716, + "grad_norm": 0.9933668375015259, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 4520 + }, + { + "epoch": 4.413054067218704, + "grad_norm": 1.0626686811447144, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 4530 + }, + { + "epoch": 4.422795908426693, + "grad_norm": 0.9536267518997192, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 4540 + }, + { + "epoch": 4.432537749634681, + "grad_norm": 0.9777140021324158, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4550 + }, + { + "epoch": 4.442279590842669, + "grad_norm": 0.980780839920044, + "learning_rate": 0.0002, + "loss": 1.2878, + "step": 4560 + }, + { + "epoch": 4.452021432050658, + "grad_norm": 1.0147196054458618, + "learning_rate": 0.0002, + "loss": 1.2597, + "step": 4570 + }, + { + "epoch": 4.461763273258645, + "grad_norm": 0.9763361811637878, + "learning_rate": 0.0002, + "loss": 1.2148, + "step": 4580 + }, + { + "epoch": 4.471505114466634, + "grad_norm": 1.0300798416137695, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 4590 + }, + { + "epoch": 4.481246955674623, + "grad_norm": 0.8833121657371521, + "learning_rate": 0.0002, + "loss": 1.2665, + "step": 4600 + }, + { + "epoch": 4.490988796882611, + "grad_norm": 1.1214020252227783, + "learning_rate": 0.0002, + "loss": 1.1899, + "step": 4610 + }, + { + "epoch": 4.500730638090599, + "grad_norm": 0.8843787908554077, + "learning_rate": 0.0002, + "loss": 1.2579, + "step": 4620 + }, + { + "epoch": 4.5104724792985875, + "grad_norm": 0.9942020773887634, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 4630 + }, + { + "epoch": 4.520214320506576, + "grad_norm": 1.0033202171325684, + "learning_rate": 0.0002, + "loss": 1.3172, + "step": 4640 + }, + { + "epoch": 4.529956161714564, + "grad_norm": 0.8767235279083252, + "learning_rate": 0.0002, + "loss": 1.2024, + "step": 4650 + }, + { + "epoch": 4.539698002922552, + "grad_norm": 1.0117276906967163, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 4660 + }, + { + "epoch": 4.549439844130541, + "grad_norm": 1.2787362337112427, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 4670 + }, + { + "epoch": 4.559181685338529, + "grad_norm": 0.8824878931045532, + "learning_rate": 0.0002, + "loss": 1.2603, + "step": 4680 + }, + { + "epoch": 4.568923526546517, + "grad_norm": 0.9209560751914978, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4690 + }, + { + "epoch": 4.578665367754506, + "grad_norm": 1.1064010858535767, + "learning_rate": 0.0002, + "loss": 1.1916, + "step": 4700 + }, + { + "epoch": 4.588407208962494, + "grad_norm": 0.8914572596549988, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 4710 + }, + { + "epoch": 4.598149050170482, + "grad_norm": 1.0412265062332153, + "learning_rate": 0.0002, + "loss": 1.2861, + "step": 4720 + }, + { + "epoch": 4.607890891378471, + "grad_norm": 1.1950221061706543, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4730 + }, + { + "epoch": 4.617632732586459, + "grad_norm": 0.8938062787055969, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 4740 + }, + { + "epoch": 4.627374573794447, + "grad_norm": 0.9849569201469421, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 4750 + }, + { + "epoch": 4.637116415002436, + "grad_norm": 1.0081515312194824, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4760 + }, + { + "epoch": 4.6468582562104235, + "grad_norm": 0.8566309213638306, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 4770 + }, + { + "epoch": 4.656600097418412, + "grad_norm": 1.1750118732452393, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 4780 + }, + { + "epoch": 4.666341938626401, + "grad_norm": 0.925502598285675, + "learning_rate": 0.0002, + "loss": 1.2537, + "step": 4790 + }, + { + "epoch": 4.676083779834388, + "grad_norm": 1.0402472019195557, + "learning_rate": 0.0002, + "loss": 1.2146, + "step": 4800 + }, + { + "epoch": 4.685825621042377, + "grad_norm": 0.9772472977638245, + "learning_rate": 0.0002, + "loss": 1.2555, + "step": 4810 + }, + { + "epoch": 4.695567462250366, + "grad_norm": 0.9082779288291931, + "learning_rate": 0.0002, + "loss": 1.2667, + "step": 4820 + }, + { + "epoch": 4.705309303458353, + "grad_norm": 0.8026862740516663, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 4830 + }, + { + "epoch": 4.715051144666342, + "grad_norm": 1.1631089448928833, + "learning_rate": 0.0002, + "loss": 1.3369, + "step": 4840 + }, + { + "epoch": 4.7247929858743305, + "grad_norm": 0.9384787678718567, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 4850 + }, + { + "epoch": 4.734534827082318, + "grad_norm": 1.2151581048965454, + "learning_rate": 0.0002, + "loss": 1.2588, + "step": 4860 + }, + { + "epoch": 4.744276668290307, + "grad_norm": 0.9679436087608337, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 4870 + }, + { + "epoch": 4.754018509498295, + "grad_norm": 0.8352158069610596, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 4880 + }, + { + "epoch": 4.763760350706283, + "grad_norm": 1.0205804109573364, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 4890 + }, + { + "epoch": 4.773502191914272, + "grad_norm": 0.9814772605895996, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 4900 + }, + { + "epoch": 4.78324403312226, + "grad_norm": 1.002854347229004, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 4910 + }, + { + "epoch": 4.792985874330248, + "grad_norm": 1.1609505414962769, + "learning_rate": 0.0002, + "loss": 1.3143, + "step": 4920 + }, + { + "epoch": 4.802727715538237, + "grad_norm": 0.9354982376098633, + "learning_rate": 0.0002, + "loss": 1.3166, + "step": 4930 + }, + { + "epoch": 4.812469556746225, + "grad_norm": 0.9761685729026794, + "learning_rate": 0.0002, + "loss": 1.2978, + "step": 4940 + }, + { + "epoch": 4.822211397954213, + "grad_norm": 1.0604596138000488, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 4950 + }, + { + "epoch": 4.8319532391622015, + "grad_norm": 1.0902808904647827, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 4960 + }, + { + "epoch": 4.84169508037019, + "grad_norm": 1.0174955129623413, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 4970 + }, + { + "epoch": 4.851436921578179, + "grad_norm": 1.0995253324508667, + "learning_rate": 0.0002, + "loss": 1.3141, + "step": 4980 + }, + { + "epoch": 4.8611787627861665, + "grad_norm": 0.880993127822876, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4990 + }, + { + "epoch": 4.870920603994155, + "grad_norm": 0.9472237825393677, + "learning_rate": 0.0002, + "loss": 1.2547, + "step": 5000 + }, + { + "epoch": 4.880662445202143, + "grad_norm": 0.9504236578941345, + "learning_rate": 0.0002, + "loss": 1.4078, + "step": 5010 + }, + { + "epoch": 4.890404286410131, + "grad_norm": 1.1261742115020752, + "learning_rate": 0.0002, + "loss": 1.2791, + "step": 5020 + }, + { + "epoch": 4.90014612761812, + "grad_norm": 0.904674768447876, + "learning_rate": 0.0002, + "loss": 1.3707, + "step": 5030 + }, + { + "epoch": 4.909887968826109, + "grad_norm": 0.8828991055488586, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 5040 + }, + { + "epoch": 4.919629810034096, + "grad_norm": 1.0156532526016235, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 5050 + }, + { + "epoch": 4.929371651242085, + "grad_norm": 0.8975168466567993, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 5060 + }, + { + "epoch": 4.939113492450073, + "grad_norm": 0.9787213802337646, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 5070 + }, + { + "epoch": 4.948855333658061, + "grad_norm": 1.0801568031311035, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 5080 + }, + { + "epoch": 4.95859717486605, + "grad_norm": 1.0655089616775513, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 5090 + }, + { + "epoch": 4.968339016074038, + "grad_norm": 0.8941320180892944, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 5100 + }, + { + "epoch": 4.978080857282026, + "grad_norm": 1.050621747970581, + "learning_rate": 0.0002, + "loss": 1.2846, + "step": 5110 + }, + { + "epoch": 4.987822698490015, + "grad_norm": 0.9724781513214111, + "learning_rate": 0.0002, + "loss": 1.3791, + "step": 5120 + }, + { + "epoch": 4.997564539698003, + "grad_norm": 0.9850538969039917, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 5130 + }, + { + "epoch": 4.9995129079396, + "eval_loss": 2.0824170112609863, + "eval_runtime": 55.592, + "eval_samples_per_second": 9.12, + "eval_steps_per_second": 1.151, + "step": 5132 + }, + { + "epoch": 5.007306380905991, + "grad_norm": 1.0096189975738525, + "learning_rate": 0.0002, + "loss": 1.037, + "step": 5140 + }, + { + "epoch": 5.01704822211398, + "grad_norm": 1.2403408288955688, + "learning_rate": 0.0002, + "loss": 1.0003, + "step": 5150 + }, + { + "epoch": 5.026790063321968, + "grad_norm": 1.1243221759796143, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 5160 + }, + { + "epoch": 5.036531904529956, + "grad_norm": 1.4745502471923828, + "learning_rate": 0.0002, + "loss": 0.9815, + "step": 5170 + }, + { + "epoch": 5.0462737457379445, + "grad_norm": 1.1913198232650757, + "learning_rate": 0.0002, + "loss": 0.9715, + "step": 5180 + }, + { + "epoch": 5.056015586945933, + "grad_norm": 1.2732855081558228, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 5190 + }, + { + "epoch": 5.065757428153921, + "grad_norm": 1.1737396717071533, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 5200 + }, + { + "epoch": 5.075499269361909, + "grad_norm": 1.4162768125534058, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 5210 + }, + { + "epoch": 5.085241110569898, + "grad_norm": 1.528274655342102, + "learning_rate": 0.0002, + "loss": 1.0333, + "step": 5220 + }, + { + "epoch": 5.094982951777886, + "grad_norm": 1.3966618776321411, + "learning_rate": 0.0002, + "loss": 1.0227, + "step": 5230 + }, + { + "epoch": 5.104724792985874, + "grad_norm": 1.3427953720092773, + "learning_rate": 0.0002, + "loss": 0.987, + "step": 5240 + }, + { + "epoch": 5.114466634193863, + "grad_norm": 1.6533905267715454, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 5250 + }, + { + "epoch": 5.124208475401851, + "grad_norm": 1.4114865064620972, + "learning_rate": 0.0002, + "loss": 1.0452, + "step": 5260 + }, + { + "epoch": 5.133950316609839, + "grad_norm": 1.5460708141326904, + "learning_rate": 0.0002, + "loss": 1.067, + "step": 5270 + }, + { + "epoch": 5.143692157817828, + "grad_norm": 1.3491919040679932, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 5280 + }, + { + "epoch": 5.153433999025816, + "grad_norm": 1.2208969593048096, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 5290 + }, + { + "epoch": 5.163175840233804, + "grad_norm": 1.1141403913497925, + "learning_rate": 0.0002, + "loss": 1.0362, + "step": 5300 + }, + { + "epoch": 5.172917681441793, + "grad_norm": 1.2938064336776733, + "learning_rate": 0.0002, + "loss": 0.9744, + "step": 5310 + }, + { + "epoch": 5.1826595226497805, + "grad_norm": 1.2704918384552002, + "learning_rate": 0.0002, + "loss": 1.0438, + "step": 5320 + }, + { + "epoch": 5.192401363857769, + "grad_norm": 1.3928544521331787, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 5330 + }, + { + "epoch": 5.202143205065758, + "grad_norm": 1.1993824243545532, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 5340 + }, + { + "epoch": 5.211885046273745, + "grad_norm": 1.5913670063018799, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 5350 + }, + { + "epoch": 5.221626887481734, + "grad_norm": 1.1577855348587036, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 5360 + }, + { + "epoch": 5.231368728689723, + "grad_norm": 1.4535993337631226, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 5370 + }, + { + "epoch": 5.24111056989771, + "grad_norm": 1.5068976879119873, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 5380 + }, + { + "epoch": 5.250852411105699, + "grad_norm": 1.2365459203720093, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 5390 + }, + { + "epoch": 5.2605942523136875, + "grad_norm": 1.3197922706604004, + "learning_rate": 0.0002, + "loss": 1.0145, + "step": 5400 + }, + { + "epoch": 5.270336093521675, + "grad_norm": 1.2395117282867432, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 5410 + }, + { + "epoch": 5.280077934729664, + "grad_norm": 1.1841236352920532, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 5420 + }, + { + "epoch": 5.289819775937652, + "grad_norm": 1.218003749847412, + "learning_rate": 0.0002, + "loss": 1.0233, + "step": 5430 + }, + { + "epoch": 5.29956161714564, + "grad_norm": 1.2210947275161743, + "learning_rate": 0.0002, + "loss": 1.0093, + "step": 5440 + }, + { + "epoch": 5.309303458353629, + "grad_norm": 1.266006588935852, + "learning_rate": 0.0002, + "loss": 0.9619, + "step": 5450 + }, + { + "epoch": 5.319045299561617, + "grad_norm": 1.2598075866699219, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 5460 + }, + { + "epoch": 5.328787140769606, + "grad_norm": 1.2410019636154175, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 5470 + }, + { + "epoch": 5.338528981977594, + "grad_norm": 1.249698519706726, + "learning_rate": 0.0002, + "loss": 1.058, + "step": 5480 + }, + { + "epoch": 5.348270823185582, + "grad_norm": 1.2398173809051514, + "learning_rate": 0.0002, + "loss": 1.0457, + "step": 5490 + }, + { + "epoch": 5.35801266439357, + "grad_norm": 1.2416654825210571, + "learning_rate": 0.0002, + "loss": 1.0139, + "step": 5500 + }, + { + "epoch": 5.3677545056015585, + "grad_norm": 1.398706316947937, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 5510 + }, + { + "epoch": 5.377496346809547, + "grad_norm": 1.3049418926239014, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 5520 + }, + { + "epoch": 5.387238188017536, + "grad_norm": 1.2528893947601318, + "learning_rate": 0.0002, + "loss": 1.0912, + "step": 5530 + }, + { + "epoch": 5.3969800292255234, + "grad_norm": 1.2963255643844604, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 5540 + }, + { + "epoch": 5.406721870433512, + "grad_norm": 1.494231104850769, + "learning_rate": 0.0002, + "loss": 1.0194, + "step": 5550 + }, + { + "epoch": 5.416463711641501, + "grad_norm": 1.2760992050170898, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 5560 + }, + { + "epoch": 5.426205552849488, + "grad_norm": 1.195292592048645, + "learning_rate": 0.0002, + "loss": 1.1088, + "step": 5570 + }, + { + "epoch": 5.435947394057477, + "grad_norm": 1.6408965587615967, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 5580 + }, + { + "epoch": 5.4456892352654656, + "grad_norm": 1.3092058897018433, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 5590 + }, + { + "epoch": 5.455431076473453, + "grad_norm": 1.2960586547851562, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 5600 + }, + { + "epoch": 5.465172917681442, + "grad_norm": 1.3560487031936646, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 5610 + }, + { + "epoch": 5.4749147588894305, + "grad_norm": 1.1896311044692993, + "learning_rate": 0.0002, + "loss": 1.0314, + "step": 5620 + }, + { + "epoch": 5.484656600097418, + "grad_norm": 1.3145595788955688, + "learning_rate": 0.0002, + "loss": 1.0435, + "step": 5630 + }, + { + "epoch": 5.494398441305407, + "grad_norm": 1.2207404375076294, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 5640 + }, + { + "epoch": 5.504140282513395, + "grad_norm": 1.266015887260437, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 5650 + }, + { + "epoch": 5.513882123721383, + "grad_norm": 1.2478289604187012, + "learning_rate": 0.0002, + "loss": 1.0696, + "step": 5660 + }, + { + "epoch": 5.523623964929372, + "grad_norm": 1.4851372241973877, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 5670 + }, + { + "epoch": 5.53336580613736, + "grad_norm": 1.4478679895401, + "learning_rate": 0.0002, + "loss": 1.0736, + "step": 5680 + }, + { + "epoch": 5.543107647345348, + "grad_norm": 1.1079537868499756, + "learning_rate": 0.0002, + "loss": 1.043, + "step": 5690 + }, + { + "epoch": 5.552849488553337, + "grad_norm": 1.4201879501342773, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 5700 + }, + { + "epoch": 5.562591329761325, + "grad_norm": 1.2092000246047974, + "learning_rate": 0.0002, + "loss": 1.0697, + "step": 5710 + }, + { + "epoch": 5.572333170969313, + "grad_norm": 1.4515851736068726, + "learning_rate": 0.0002, + "loss": 0.9868, + "step": 5720 + }, + { + "epoch": 5.5820750121773015, + "grad_norm": 1.3260412216186523, + "learning_rate": 0.0002, + "loss": 1.1547, + "step": 5730 + }, + { + "epoch": 5.59181685338529, + "grad_norm": 1.248191475868225, + "learning_rate": 0.0002, + "loss": 1.1388, + "step": 5740 + }, + { + "epoch": 5.601558694593278, + "grad_norm": 1.2037307024002075, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 5750 + }, + { + "epoch": 5.611300535801266, + "grad_norm": 1.341237187385559, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 5760 + }, + { + "epoch": 5.621042377009255, + "grad_norm": 1.130115270614624, + "learning_rate": 0.0002, + "loss": 1.0942, + "step": 5770 + }, + { + "epoch": 5.630784218217243, + "grad_norm": 1.3834772109985352, + "learning_rate": 0.0002, + "loss": 1.1029, + "step": 5780 + }, + { + "epoch": 5.640526059425231, + "grad_norm": 1.2586270570755005, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 5790 + }, + { + "epoch": 5.65026790063322, + "grad_norm": 1.3233023881912231, + "learning_rate": 0.0002, + "loss": 1.0186, + "step": 5800 + }, + { + "epoch": 5.660009741841208, + "grad_norm": 1.2711341381072998, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 5810 + }, + { + "epoch": 5.669751583049196, + "grad_norm": 1.3867720365524292, + "learning_rate": 0.0002, + "loss": 1.0897, + "step": 5820 + }, + { + "epoch": 5.679493424257185, + "grad_norm": 1.4783269166946411, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 5830 + }, + { + "epoch": 5.6892352654651726, + "grad_norm": 1.2744768857955933, + "learning_rate": 0.0002, + "loss": 1.0632, + "step": 5840 + }, + { + "epoch": 5.698977106673161, + "grad_norm": 1.3405882120132446, + "learning_rate": 0.0002, + "loss": 1.1484, + "step": 5850 + }, + { + "epoch": 5.70871894788115, + "grad_norm": 1.204300880432129, + "learning_rate": 0.0002, + "loss": 1.0975, + "step": 5860 + }, + { + "epoch": 5.7184607890891375, + "grad_norm": 1.2954572439193726, + "learning_rate": 0.0002, + "loss": 1.0494, + "step": 5870 + }, + { + "epoch": 5.728202630297126, + "grad_norm": 1.5478382110595703, + "learning_rate": 0.0002, + "loss": 1.0643, + "step": 5880 + }, + { + "epoch": 5.737944471505115, + "grad_norm": 1.2095842361450195, + "learning_rate": 0.0002, + "loss": 1.0582, + "step": 5890 + }, + { + "epoch": 5.747686312713103, + "grad_norm": 1.0691519975662231, + "learning_rate": 0.0002, + "loss": 1.1, + "step": 5900 + }, + { + "epoch": 5.757428153921091, + "grad_norm": 1.1920677423477173, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 5910 + }, + { + "epoch": 5.76716999512908, + "grad_norm": 1.2051277160644531, + "learning_rate": 0.0002, + "loss": 1.1746, + "step": 5920 + }, + { + "epoch": 5.776911836337067, + "grad_norm": 1.197490930557251, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 5930 + }, + { + "epoch": 5.786653677545056, + "grad_norm": 1.2003998756408691, + "learning_rate": 0.0002, + "loss": 1.07, + "step": 5940 + }, + { + "epoch": 5.7963955187530445, + "grad_norm": 1.2323646545410156, + "learning_rate": 0.0002, + "loss": 1.0938, + "step": 5950 + }, + { + "epoch": 5.806137359961033, + "grad_norm": 1.2593932151794434, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 5960 + }, + { + "epoch": 5.815879201169021, + "grad_norm": 1.1835976839065552, + "learning_rate": 0.0002, + "loss": 1.0829, + "step": 5970 + }, + { + "epoch": 5.825621042377009, + "grad_norm": 1.4770104885101318, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 5980 + }, + { + "epoch": 5.835362883584997, + "grad_norm": 1.1025809049606323, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 5990 + }, + { + "epoch": 5.845104724792986, + "grad_norm": 1.364588975906372, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 6000 + }, + { + "epoch": 5.854846566000974, + "grad_norm": 1.2340112924575806, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 6010 + }, + { + "epoch": 5.864588407208963, + "grad_norm": 1.4925711154937744, + "learning_rate": 0.0002, + "loss": 1.1123, + "step": 6020 + }, + { + "epoch": 5.874330248416951, + "grad_norm": 1.3516744375228882, + "learning_rate": 0.0002, + "loss": 1.12, + "step": 6030 + }, + { + "epoch": 5.884072089624939, + "grad_norm": 1.2058138847351074, + "learning_rate": 0.0002, + "loss": 1.1399, + "step": 6040 + }, + { + "epoch": 5.893813930832927, + "grad_norm": 1.13870108127594, + "learning_rate": 0.0002, + "loss": 1.1074, + "step": 6050 + }, + { + "epoch": 5.9035557720409155, + "grad_norm": 1.1587319374084473, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 6060 + }, + { + "epoch": 5.913297613248904, + "grad_norm": 1.164481520652771, + "learning_rate": 0.0002, + "loss": 1.1376, + "step": 6070 + }, + { + "epoch": 5.923039454456893, + "grad_norm": 1.2115206718444824, + "learning_rate": 0.0002, + "loss": 1.1262, + "step": 6080 + }, + { + "epoch": 5.93278129566488, + "grad_norm": 1.3201590776443481, + "learning_rate": 0.0002, + "loss": 1.1345, + "step": 6090 + }, + { + "epoch": 5.942523136872869, + "grad_norm": 1.287380576133728, + "learning_rate": 0.0002, + "loss": 1.1288, + "step": 6100 + }, + { + "epoch": 5.952264978080858, + "grad_norm": 1.1820166110992432, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 6110 + }, + { + "epoch": 5.962006819288845, + "grad_norm": 1.2550667524337769, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 6120 + }, + { + "epoch": 5.971748660496834, + "grad_norm": 1.3547813892364502, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 6130 + }, + { + "epoch": 5.9814905017048225, + "grad_norm": 1.260842204093933, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 6140 + }, + { + "epoch": 5.99123234291281, + "grad_norm": 1.1643036603927612, + "learning_rate": 0.0002, + "loss": 1.1119, + "step": 6150 + }, + { + "epoch": 6.0, + "eval_loss": 2.2628161907196045, + "eval_runtime": 57.2379, + "eval_samples_per_second": 8.858, + "eval_steps_per_second": 1.118, + "step": 6159 + } + ], + "logging_steps": 10, + "max_steps": 8208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.1634930932383744e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-6159/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a476254c5476ecf662100e81977dbff8e31af0b8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ad21b406fa7ff475f2aacc870dc56a79db5aa11115489ef6c48349e78b13267 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0b3a7055a8064cadb3433d54e50ff81e4c28b2e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb60d96f48eec71564d7294f0ee00a6eee0d4c8fc2832b91a65336aed1c90bc +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd39b7bf319198de21b9e2a498f3818eaf87d824 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c177f26bb6249de2a1cc5290b39f6a754b9ac519df5f3be6f7e8066aa6640f0 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0512da751f5b1afd057bc1deacf2719d3925e537 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e4edaec93b10616e7d2a79bcf0bd5b1c26686d61184d25e7d0a4b9ea9617362 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..76679bf68796a18e3b82a06687d1b36221aae8b2 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/trainer_state.json @@ -0,0 +1,5115 @@ +{ + "best_metric": 1.8103164434432983, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", + "epoch": 6.9995129079396, + "eval_steps": 10, + "global_step": 7185, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00974184120798831, + "grad_norm": 0.6537588834762573, + "learning_rate": 0.0002, + "loss": 3.0782, + "step": 10 + }, + { + "epoch": 0.01948368241597662, + "grad_norm": 0.5270306468009949, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 20 + }, + { + "epoch": 0.029225523623964928, + "grad_norm": 0.6826501488685608, + "learning_rate": 0.0002, + "loss": 2.2589, + "step": 30 + }, + { + "epoch": 0.03896736483195324, + "grad_norm": 0.5061377286911011, + "learning_rate": 0.0002, + "loss": 2.0141, + "step": 40 + }, + { + "epoch": 0.04870920603994155, + "grad_norm": 0.4300410747528076, + "learning_rate": 0.0002, + "loss": 1.9458, + "step": 50 + }, + { + "epoch": 0.058451047247929856, + "grad_norm": 0.5063319802284241, + "learning_rate": 0.0002, + "loss": 1.983, + "step": 60 + }, + { + "epoch": 0.06819288845591817, + "grad_norm": 0.49310117959976196, + "learning_rate": 0.0002, + "loss": 1.9799, + "step": 70 + }, + { + "epoch": 0.07793472966390648, + "grad_norm": 0.4676004648208618, + "learning_rate": 0.0002, + "loss": 1.9277, + "step": 80 + }, + { + "epoch": 0.08767657087189479, + "grad_norm": 0.41647228598594666, + "learning_rate": 0.0002, + "loss": 1.9147, + "step": 90 + }, + { + "epoch": 0.0974184120798831, + "grad_norm": 0.40217313170433044, + "learning_rate": 0.0002, + "loss": 1.8894, + "step": 100 + }, + { + "epoch": 0.1071602532878714, + "grad_norm": 0.4123637080192566, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 110 + }, + { + "epoch": 0.11690209449585971, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 120 + }, + { + "epoch": 0.12664393570384802, + "grad_norm": 0.3179326355457306, + "learning_rate": 0.0002, + "loss": 1.894, + "step": 130 + }, + { + "epoch": 0.13638577691183634, + "grad_norm": 0.3548192083835602, + "learning_rate": 0.0002, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.14612761811982464, + "grad_norm": 0.3273540139198303, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 150 + }, + { + "epoch": 0.15586945932781296, + "grad_norm": 0.36500975489616394, + "learning_rate": 0.0002, + "loss": 1.8496, + "step": 160 + }, + { + "epoch": 0.16561130053580125, + "grad_norm": 0.4106619656085968, + "learning_rate": 0.0002, + "loss": 1.8473, + "step": 170 + }, + { + "epoch": 0.17535314174378958, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0002, + "loss": 1.841, + "step": 180 + }, + { + "epoch": 0.1850949829517779, + "grad_norm": 0.3608580231666565, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 190 + }, + { + "epoch": 0.1948368241597662, + "grad_norm": 0.4291760325431824, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 200 + }, + { + "epoch": 0.20457866536775451, + "grad_norm": 0.344184011220932, + "learning_rate": 0.0002, + "loss": 1.8437, + "step": 210 + }, + { + "epoch": 0.2143205065757428, + "grad_norm": 0.3834705650806427, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 220 + }, + { + "epoch": 0.22406234778373113, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 230 + }, + { + "epoch": 0.23380418899171942, + "grad_norm": 0.4306780695915222, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 240 + }, + { + "epoch": 0.24354603019970775, + "grad_norm": 0.5066465139389038, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 250 + }, + { + "epoch": 0.25328787140769604, + "grad_norm": 0.34227681159973145, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 260 + }, + { + "epoch": 0.26302971261568436, + "grad_norm": 0.3346865475177765, + "learning_rate": 0.0002, + "loss": 1.8614, + "step": 270 + }, + { + "epoch": 0.2727715538236727, + "grad_norm": 0.3639362156391144, + "learning_rate": 0.0002, + "loss": 1.8502, + "step": 280 + }, + { + "epoch": 0.282513395031661, + "grad_norm": 0.33223700523376465, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 290 + }, + { + "epoch": 0.2922552362396493, + "grad_norm": 0.35176315903663635, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 300 + }, + { + "epoch": 0.3019970774476376, + "grad_norm": 0.3581472635269165, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 310 + }, + { + "epoch": 0.3117389186556259, + "grad_norm": 0.35943421721458435, + "learning_rate": 0.0002, + "loss": 1.8262, + "step": 320 + }, + { + "epoch": 0.32148075986361424, + "grad_norm": 0.322051078081131, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 330 + }, + { + "epoch": 0.3312226010716025, + "grad_norm": 0.33904823660850525, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 340 + }, + { + "epoch": 0.34096444227959083, + "grad_norm": 0.39162731170654297, + "learning_rate": 0.0002, + "loss": 1.9159, + "step": 350 + }, + { + "epoch": 0.35070628348757915, + "grad_norm": 0.330624520778656, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 360 + }, + { + "epoch": 0.3604481246955675, + "grad_norm": 0.3793248236179352, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 370 + }, + { + "epoch": 0.3701899659035558, + "grad_norm": 0.3347395658493042, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 380 + }, + { + "epoch": 0.37993180711154406, + "grad_norm": 0.30527254939079285, + "learning_rate": 0.0002, + "loss": 1.9244, + "step": 390 + }, + { + "epoch": 0.3896736483195324, + "grad_norm": 0.3081390857696533, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 400 + }, + { + "epoch": 0.3994154895275207, + "grad_norm": 0.3742620050907135, + "learning_rate": 0.0002, + "loss": 1.8968, + "step": 410 + }, + { + "epoch": 0.40915733073550903, + "grad_norm": 0.4080568253993988, + "learning_rate": 0.0002, + "loss": 1.8095, + "step": 420 + }, + { + "epoch": 0.4188991719434973, + "grad_norm": 0.38034746050834656, + "learning_rate": 0.0002, + "loss": 1.8555, + "step": 430 + }, + { + "epoch": 0.4286410131514856, + "grad_norm": 0.34893402457237244, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 440 + }, + { + "epoch": 0.43838285435947394, + "grad_norm": 0.33285608887672424, + "learning_rate": 0.0002, + "loss": 1.8481, + "step": 450 + }, + { + "epoch": 0.44812469556746226, + "grad_norm": 0.4110095798969269, + "learning_rate": 0.0002, + "loss": 1.8466, + "step": 460 + }, + { + "epoch": 0.4578665367754506, + "grad_norm": 0.3658817410469055, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 470 + }, + { + "epoch": 0.46760837798343885, + "grad_norm": 0.31350770592689514, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 480 + }, + { + "epoch": 0.47735021919142717, + "grad_norm": 0.38827991485595703, + "learning_rate": 0.0002, + "loss": 1.7839, + "step": 490 + }, + { + "epoch": 0.4870920603994155, + "grad_norm": 0.3792393207550049, + "learning_rate": 0.0002, + "loss": 1.8224, + "step": 500 + }, + { + "epoch": 0.4968339016074038, + "grad_norm": 0.3004095256328583, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 510 + }, + { + "epoch": 0.5065757428153921, + "grad_norm": 0.3200063407421112, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 520 + }, + { + "epoch": 0.5163175840233805, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 530 + }, + { + "epoch": 0.5260594252313687, + "grad_norm": 0.30258631706237793, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 540 + }, + { + "epoch": 0.535801266439357, + "grad_norm": 0.28210392594337463, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 550 + }, + { + "epoch": 0.5455431076473454, + "grad_norm": 0.34854066371917725, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 560 + }, + { + "epoch": 0.5552849488553336, + "grad_norm": 0.31689873337745667, + "learning_rate": 0.0002, + "loss": 1.8331, + "step": 570 + }, + { + "epoch": 0.565026790063322, + "grad_norm": 0.31253790855407715, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 580 + }, + { + "epoch": 0.5747686312713103, + "grad_norm": 0.3229721188545227, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 590 + }, + { + "epoch": 0.5845104724792985, + "grad_norm": 0.3723772466182709, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 600 + }, + { + "epoch": 0.5942523136872869, + "grad_norm": 0.345798522233963, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 610 + }, + { + "epoch": 0.6039941548952752, + "grad_norm": 0.3440598249435425, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 620 + }, + { + "epoch": 0.6137359961032636, + "grad_norm": 0.3406416177749634, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 630 + }, + { + "epoch": 0.6234778373112518, + "grad_norm": 0.3218357264995575, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 640 + }, + { + "epoch": 0.6332196785192401, + "grad_norm": 0.45319172739982605, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 650 + }, + { + "epoch": 0.6429615197272285, + "grad_norm": 0.2787110507488251, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 660 + }, + { + "epoch": 0.6527033609352167, + "grad_norm": 0.3064707815647125, + "learning_rate": 0.0002, + "loss": 1.8426, + "step": 670 + }, + { + "epoch": 0.662445202143205, + "grad_norm": 0.2940629720687866, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 680 + }, + { + "epoch": 0.6721870433511934, + "grad_norm": 0.31695225834846497, + "learning_rate": 0.0002, + "loss": 1.7865, + "step": 690 + }, + { + "epoch": 0.6819288845591817, + "grad_norm": 0.29589611291885376, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 700 + }, + { + "epoch": 0.69167072576717, + "grad_norm": 0.3062121570110321, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 710 + }, + { + "epoch": 0.7014125669751583, + "grad_norm": 0.3315656781196594, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 720 + }, + { + "epoch": 0.7111544081831466, + "grad_norm": 0.30353930592536926, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 730 + }, + { + "epoch": 0.720896249391135, + "grad_norm": 0.28360483050346375, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 0.7306380905991232, + "grad_norm": 0.3362562656402588, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 750 + }, + { + "epoch": 0.7403799318071116, + "grad_norm": 0.40434667468070984, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 760 + }, + { + "epoch": 0.7501217730150999, + "grad_norm": 0.2930425703525543, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 770 + }, + { + "epoch": 0.7598636142230881, + "grad_norm": 0.30177003145217896, + "learning_rate": 0.0002, + "loss": 1.8216, + "step": 780 + }, + { + "epoch": 0.7696054554310765, + "grad_norm": 0.2784474790096283, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 790 + }, + { + "epoch": 0.7793472966390648, + "grad_norm": 0.35849854350090027, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 800 + }, + { + "epoch": 0.7890891378470531, + "grad_norm": 0.27329114079475403, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 810 + }, + { + "epoch": 0.7988309790550414, + "grad_norm": 0.33331671357154846, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 820 + }, + { + "epoch": 0.8085728202630297, + "grad_norm": 0.28727295994758606, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 830 + }, + { + "epoch": 0.8183146614710181, + "grad_norm": 0.31391268968582153, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 840 + }, + { + "epoch": 0.8280565026790063, + "grad_norm": 0.3303709030151367, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 850 + }, + { + "epoch": 0.8377983438869946, + "grad_norm": 0.33772537112236023, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 860 + }, + { + "epoch": 0.847540185094983, + "grad_norm": 0.32876333594322205, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 870 + }, + { + "epoch": 0.8572820263029712, + "grad_norm": 0.28444716334342957, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 880 + }, + { + "epoch": 0.8670238675109596, + "grad_norm": 0.3070019483566284, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 890 + }, + { + "epoch": 0.8767657087189479, + "grad_norm": 0.29484760761260986, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 900 + }, + { + "epoch": 0.8865075499269361, + "grad_norm": 0.32373034954071045, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 910 + }, + { + "epoch": 0.8962493911349245, + "grad_norm": 0.3229396939277649, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 920 + }, + { + "epoch": 0.9059912323429128, + "grad_norm": 0.33151453733444214, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 930 + }, + { + "epoch": 0.9157330735509012, + "grad_norm": 0.32037460803985596, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 940 + }, + { + "epoch": 0.9254749147588894, + "grad_norm": 0.31283533573150635, + "learning_rate": 0.0002, + "loss": 1.822, + "step": 950 + }, + { + "epoch": 0.9352167559668777, + "grad_norm": 0.27984118461608887, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 960 + }, + { + "epoch": 0.9449585971748661, + "grad_norm": 0.316500186920166, + "learning_rate": 0.0002, + "loss": 1.7755, + "step": 970 + }, + { + "epoch": 0.9547004383828543, + "grad_norm": 0.33708682656288147, + "learning_rate": 0.0002, + "loss": 1.8032, + "step": 980 + }, + { + "epoch": 0.9644422795908427, + "grad_norm": 0.31026017665863037, + "learning_rate": 0.0002, + "loss": 1.8863, + "step": 990 + }, + { + "epoch": 0.974184120798831, + "grad_norm": 0.30874672532081604, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 1000 + }, + { + "epoch": 0.9839259620068193, + "grad_norm": 0.3257741630077362, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1010 + }, + { + "epoch": 0.9936678032148076, + "grad_norm": 0.2865653932094574, + "learning_rate": 0.0002, + "loss": 1.7936, + "step": 1020 + }, + { + "epoch": 0.9995129079396006, + "eval_loss": 1.8103164434432983, + "eval_runtime": 56.3917, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 1.135, + "step": 1026 + }, + { + "epoch": 1.003409644422796, + "grad_norm": 0.2860608398914337, + "learning_rate": 0.0002, + "loss": 1.7013, + "step": 1030 + }, + { + "epoch": 1.0131514856307842, + "grad_norm": 0.3156210780143738, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 1040 + }, + { + "epoch": 1.0228933268387725, + "grad_norm": 0.26126575469970703, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 1050 + }, + { + "epoch": 1.032635168046761, + "grad_norm": 0.3019633889198303, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 1060 + }, + { + "epoch": 1.042377009254749, + "grad_norm": 0.534140944480896, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1070 + }, + { + "epoch": 1.0521188504627375, + "grad_norm": 0.311872661113739, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 1080 + }, + { + "epoch": 1.0618606916707258, + "grad_norm": 0.3276001513004303, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 1090 + }, + { + "epoch": 1.071602532878714, + "grad_norm": 0.35227468609809875, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1100 + }, + { + "epoch": 1.0813443740867024, + "grad_norm": 0.3597564995288849, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 1110 + }, + { + "epoch": 1.0910862152946907, + "grad_norm": 0.3547225296497345, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 1120 + }, + { + "epoch": 1.100828056502679, + "grad_norm": 0.3399786353111267, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1130 + }, + { + "epoch": 1.1105698977106673, + "grad_norm": 0.3309086263179779, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 1140 + }, + { + "epoch": 1.1203117389186557, + "grad_norm": 0.39330706000328064, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1150 + }, + { + "epoch": 1.130053580126644, + "grad_norm": 0.3628021776676178, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1160 + }, + { + "epoch": 1.1397954213346322, + "grad_norm": 0.32995012402534485, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 1170 + }, + { + "epoch": 1.1495372625426206, + "grad_norm": 0.36292821168899536, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1180 + }, + { + "epoch": 1.159279103750609, + "grad_norm": 0.3470092713832855, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 1190 + }, + { + "epoch": 1.169020944958597, + "grad_norm": 0.3496156334877014, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1200 + }, + { + "epoch": 1.1787627861665855, + "grad_norm": 0.3442084789276123, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1210 + }, + { + "epoch": 1.1885046273745739, + "grad_norm": 0.34983909130096436, + "learning_rate": 0.0002, + "loss": 1.7763, + "step": 1220 + }, + { + "epoch": 1.198246468582562, + "grad_norm": 0.36505937576293945, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1230 + }, + { + "epoch": 1.2079883097905504, + "grad_norm": 0.31624770164489746, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1240 + }, + { + "epoch": 1.2177301509985388, + "grad_norm": 0.3528020679950714, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1250 + }, + { + "epoch": 1.2274719922065271, + "grad_norm": 0.29294025897979736, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1260 + }, + { + "epoch": 1.2372138334145153, + "grad_norm": 0.35048434138298035, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1270 + }, + { + "epoch": 1.2469556746225037, + "grad_norm": 0.35224461555480957, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1280 + }, + { + "epoch": 1.256697515830492, + "grad_norm": 0.4041554629802704, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1290 + }, + { + "epoch": 1.2664393570384802, + "grad_norm": 0.3447791039943695, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1300 + }, + { + "epoch": 1.2761811982464686, + "grad_norm": 0.3315333425998688, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1310 + }, + { + "epoch": 1.285923039454457, + "grad_norm": 0.3587741255760193, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 1320 + }, + { + "epoch": 1.2956648806624451, + "grad_norm": 0.3704394996166229, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1330 + }, + { + "epoch": 1.3054067218704335, + "grad_norm": 0.38131803274154663, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1340 + }, + { + "epoch": 1.3151485630784219, + "grad_norm": 0.36109617352485657, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 1350 + }, + { + "epoch": 1.32489040428641, + "grad_norm": 0.37283554673194885, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 1360 + }, + { + "epoch": 1.3346322454943984, + "grad_norm": 0.31808891892433167, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1370 + }, + { + "epoch": 1.3443740867023868, + "grad_norm": 0.3370385766029358, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1380 + }, + { + "epoch": 1.354115927910375, + "grad_norm": 0.3568558394908905, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1390 + }, + { + "epoch": 1.3638577691183633, + "grad_norm": 0.3537410497665405, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1400 + }, + { + "epoch": 1.3735996103263517, + "grad_norm": 0.3536544144153595, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 1410 + }, + { + "epoch": 1.38334145153434, + "grad_norm": 0.3772895038127899, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1420 + }, + { + "epoch": 1.3930832927423282, + "grad_norm": 0.38079720735549927, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1430 + }, + { + "epoch": 1.4028251339503166, + "grad_norm": 0.3811109662055969, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1440 + }, + { + "epoch": 1.412566975158305, + "grad_norm": 0.38586318492889404, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 1450 + }, + { + "epoch": 1.4223088163662934, + "grad_norm": 0.3405744135379791, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1460 + }, + { + "epoch": 1.4320506575742815, + "grad_norm": 0.39527642726898193, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1470 + }, + { + "epoch": 1.44179249878227, + "grad_norm": 0.4494728744029999, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 1480 + }, + { + "epoch": 1.4515343399902583, + "grad_norm": 0.34068453311920166, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1490 + }, + { + "epoch": 1.4612761811982464, + "grad_norm": 0.36169710755348206, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 1500 + }, + { + "epoch": 1.4710180224062348, + "grad_norm": 0.31519418954849243, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 1510 + }, + { + "epoch": 1.4807598636142232, + "grad_norm": 0.35117292404174805, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.4905017048222113, + "grad_norm": 0.40951141715049744, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1530 + }, + { + "epoch": 1.5002435460301997, + "grad_norm": 0.37542906403541565, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 1540 + }, + { + "epoch": 1.509985387238188, + "grad_norm": 0.35395753383636475, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1550 + }, + { + "epoch": 1.5197272284461762, + "grad_norm": 0.35497018694877625, + "learning_rate": 0.0002, + "loss": 1.6517, + "step": 1560 + }, + { + "epoch": 1.5294690696541646, + "grad_norm": 0.3693031072616577, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 1570 + }, + { + "epoch": 1.539210910862153, + "grad_norm": 0.34013301134109497, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1580 + }, + { + "epoch": 1.5489527520701412, + "grad_norm": 0.37312784790992737, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1590 + }, + { + "epoch": 1.5586945932781295, + "grad_norm": 0.357496440410614, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1600 + }, + { + "epoch": 1.568436434486118, + "grad_norm": 0.35192370414733887, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1610 + }, + { + "epoch": 1.578178275694106, + "grad_norm": 0.34144821763038635, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 1620 + }, + { + "epoch": 1.5879201169020944, + "grad_norm": 0.3320509195327759, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 1630 + }, + { + "epoch": 1.5976619581100828, + "grad_norm": 0.34178847074508667, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1640 + }, + { + "epoch": 1.607403799318071, + "grad_norm": 0.36567580699920654, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 1650 + }, + { + "epoch": 1.6171456405260596, + "grad_norm": 0.35599812865257263, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1660 + }, + { + "epoch": 1.6268874817340477, + "grad_norm": 0.33765384554862976, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1670 + }, + { + "epoch": 1.636629322942036, + "grad_norm": 0.33142679929733276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1680 + }, + { + "epoch": 1.6463711641500245, + "grad_norm": 0.6959079504013062, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1690 + }, + { + "epoch": 1.6561130053580126, + "grad_norm": 0.35073819756507874, + "learning_rate": 0.0002, + "loss": 1.7665, + "step": 1700 + }, + { + "epoch": 1.665854846566001, + "grad_norm": 0.3461478352546692, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1710 + }, + { + "epoch": 1.6755966877739894, + "grad_norm": 0.3697752058506012, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 1720 + }, + { + "epoch": 1.6853385289819776, + "grad_norm": 0.3755154609680176, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1730 + }, + { + "epoch": 1.695080370189966, + "grad_norm": 0.33977627754211426, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 1740 + }, + { + "epoch": 1.7048222113979543, + "grad_norm": 0.4001041650772095, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1750 + }, + { + "epoch": 1.7145640526059425, + "grad_norm": 0.36998286843299866, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 1760 + }, + { + "epoch": 1.7243058938139308, + "grad_norm": 0.39944565296173096, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1770 + }, + { + "epoch": 1.7340477350219192, + "grad_norm": 0.4002859890460968, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1780 + }, + { + "epoch": 1.7437895762299074, + "grad_norm": 0.33336859941482544, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 1790 + }, + { + "epoch": 1.7535314174378958, + "grad_norm": 0.35853952169418335, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 1800 + }, + { + "epoch": 1.7632732586458841, + "grad_norm": 0.35876700282096863, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1810 + }, + { + "epoch": 1.7730150998538723, + "grad_norm": 0.3497968912124634, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 1820 + }, + { + "epoch": 1.7827569410618607, + "grad_norm": 0.33182016015052795, + "learning_rate": 0.0002, + "loss": 1.7128, + "step": 1830 + }, + { + "epoch": 1.792498782269849, + "grad_norm": 0.33359771966934204, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1840 + }, + { + "epoch": 1.8022406234778372, + "grad_norm": 0.38070961833000183, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 1850 + }, + { + "epoch": 1.8119824646858256, + "grad_norm": 0.34111160039901733, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 1860 + }, + { + "epoch": 1.821724305893814, + "grad_norm": 0.4439302980899811, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1870 + }, + { + "epoch": 1.8314661471018021, + "grad_norm": 0.37065210938453674, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 1880 + }, + { + "epoch": 1.8412079883097907, + "grad_norm": 0.33630406856536865, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 1890 + }, + { + "epoch": 1.8509498295177789, + "grad_norm": 0.334553986787796, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 1900 + }, + { + "epoch": 1.860691670725767, + "grad_norm": 0.3603808879852295, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1910 + }, + { + "epoch": 1.8704335119337556, + "grad_norm": 0.4307343363761902, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1920 + }, + { + "epoch": 1.8801753531417438, + "grad_norm": 0.455602765083313, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1930 + }, + { + "epoch": 1.8899171943497322, + "grad_norm": 0.35242316126823425, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 1940 + }, + { + "epoch": 1.8996590355577205, + "grad_norm": 0.3589116632938385, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1950 + }, + { + "epoch": 1.9094008767657087, + "grad_norm": 0.3540741801261902, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1960 + }, + { + "epoch": 1.919142717973697, + "grad_norm": 0.3547612428665161, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 1970 + }, + { + "epoch": 1.9288845591816854, + "grad_norm": 0.3485773503780365, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1980 + }, + { + "epoch": 1.9386264003896736, + "grad_norm": 0.3560304641723633, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1990 + }, + { + "epoch": 1.948368241597662, + "grad_norm": 0.33299335837364197, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 2000 + }, + { + "epoch": 1.9581100828056504, + "grad_norm": 0.35622233152389526, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2010 + }, + { + "epoch": 1.9678519240136385, + "grad_norm": 0.3681301474571228, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 2020 + }, + { + "epoch": 1.9775937652216269, + "grad_norm": 0.36158084869384766, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2030 + }, + { + "epoch": 1.9873356064296153, + "grad_norm": 0.32560569047927856, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 2040 + }, + { + "epoch": 1.9970774476376034, + "grad_norm": 0.37404149770736694, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 2050 + }, + { + "epoch": 2.0, + "eval_loss": 1.8119343519210815, + "eval_runtime": 96.0045, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 0.667, + "step": 2053 + }, + { + "epoch": 2.006819288845592, + "grad_norm": 0.374188631772995, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 2060 + }, + { + "epoch": 2.01656113005358, + "grad_norm": 0.421764075756073, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2070 + }, + { + "epoch": 2.0263029712615683, + "grad_norm": 0.43841829895973206, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2080 + }, + { + "epoch": 2.036044812469557, + "grad_norm": 0.42298218607902527, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 2090 + }, + { + "epoch": 2.045786653677545, + "grad_norm": 0.43669602274894714, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 2100 + }, + { + "epoch": 2.0555284948855332, + "grad_norm": 0.4080469012260437, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 2110 + }, + { + "epoch": 2.065270336093522, + "grad_norm": 0.483192503452301, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2120 + }, + { + "epoch": 2.07501217730151, + "grad_norm": 0.44427400827407837, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 2130 + }, + { + "epoch": 2.084754018509498, + "grad_norm": 0.48835131525993347, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 2140 + }, + { + "epoch": 2.0944958597174868, + "grad_norm": 0.42733684182167053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2150 + }, + { + "epoch": 2.104237700925475, + "grad_norm": 0.4258694648742676, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 2160 + }, + { + "epoch": 2.113979542133463, + "grad_norm": 0.5164985656738281, + "learning_rate": 0.0002, + "loss": 1.517, + "step": 2170 + }, + { + "epoch": 2.1237213833414517, + "grad_norm": 0.4279228150844574, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 2180 + }, + { + "epoch": 2.13346322454944, + "grad_norm": 0.48209506273269653, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2190 + }, + { + "epoch": 2.143205065757428, + "grad_norm": 0.4071785509586334, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 2200 + }, + { + "epoch": 2.1529469069654166, + "grad_norm": 0.4629398584365845, + "learning_rate": 0.0002, + "loss": 1.5469, + "step": 2210 + }, + { + "epoch": 2.1626887481734047, + "grad_norm": 0.44390997290611267, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2220 + }, + { + "epoch": 2.172430589381393, + "grad_norm": 0.46886971592903137, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 2230 + }, + { + "epoch": 2.1821724305893815, + "grad_norm": 0.43745434284210205, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 2240 + }, + { + "epoch": 2.1919142717973696, + "grad_norm": 0.42737245559692383, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 2250 + }, + { + "epoch": 2.201656113005358, + "grad_norm": 0.5028428435325623, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 2260 + }, + { + "epoch": 2.2113979542133464, + "grad_norm": 0.48987212777137756, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 2270 + }, + { + "epoch": 2.2211397954213346, + "grad_norm": 0.48186370730400085, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 2280 + }, + { + "epoch": 2.2308816366293227, + "grad_norm": 0.4417429566383362, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 2290 + }, + { + "epoch": 2.2406234778373113, + "grad_norm": 0.4757710099220276, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2300 + }, + { + "epoch": 2.2503653190452995, + "grad_norm": 0.44449448585510254, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2310 + }, + { + "epoch": 2.260107160253288, + "grad_norm": 0.5070863962173462, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 2320 + }, + { + "epoch": 2.269849001461276, + "grad_norm": 0.4967133700847626, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2330 + }, + { + "epoch": 2.2795908426692644, + "grad_norm": 0.5110220909118652, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2340 + }, + { + "epoch": 2.289332683877253, + "grad_norm": 0.47984135150909424, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2350 + }, + { + "epoch": 2.299074525085241, + "grad_norm": 0.5005794763565063, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 2360 + }, + { + "epoch": 2.3088163662932293, + "grad_norm": 0.4991425573825836, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 2370 + }, + { + "epoch": 2.318558207501218, + "grad_norm": 0.4948616623878479, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2380 + }, + { + "epoch": 2.328300048709206, + "grad_norm": 0.4533160328865051, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2390 + }, + { + "epoch": 2.338041889917194, + "grad_norm": 0.5871071219444275, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 2400 + }, + { + "epoch": 2.347783731125183, + "grad_norm": 0.5048075914382935, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2410 + }, + { + "epoch": 2.357525572333171, + "grad_norm": 0.4973750412464142, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 2420 + }, + { + "epoch": 2.367267413541159, + "grad_norm": 0.48294538259506226, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 2430 + }, + { + "epoch": 2.3770092547491477, + "grad_norm": 0.7180454134941101, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 2440 + }, + { + "epoch": 2.386751095957136, + "grad_norm": 0.4627632796764374, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2450 + }, + { + "epoch": 2.396492937165124, + "grad_norm": 0.4834378957748413, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 2460 + }, + { + "epoch": 2.4062347783731126, + "grad_norm": 0.5173670649528503, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 2470 + }, + { + "epoch": 2.4159766195811008, + "grad_norm": 0.49652737379074097, + "learning_rate": 0.0002, + "loss": 1.5464, + "step": 2480 + }, + { + "epoch": 2.4257184607890894, + "grad_norm": 0.47052669525146484, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 2490 + }, + { + "epoch": 2.4354603019970775, + "grad_norm": 0.5188006162643433, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 2500 + }, + { + "epoch": 2.4452021432050657, + "grad_norm": 0.5010119676589966, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 2510 + }, + { + "epoch": 2.4549439844130543, + "grad_norm": 0.4765235483646393, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2520 + }, + { + "epoch": 2.4646858256210424, + "grad_norm": 0.5292699337005615, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2530 + }, + { + "epoch": 2.4744276668290306, + "grad_norm": 0.48555099964141846, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 2540 + }, + { + "epoch": 2.484169508037019, + "grad_norm": 0.4764043092727661, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 2550 + }, + { + "epoch": 2.4939113492450073, + "grad_norm": 0.47839659452438354, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 2560 + }, + { + "epoch": 2.5036531904529955, + "grad_norm": 0.4514436721801758, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 2570 + }, + { + "epoch": 2.513395031660984, + "grad_norm": 0.5681955218315125, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2580 + }, + { + "epoch": 2.5231368728689723, + "grad_norm": 0.49655985832214355, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 2590 + }, + { + "epoch": 2.5328787140769604, + "grad_norm": 0.5077657103538513, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2600 + }, + { + "epoch": 2.542620555284949, + "grad_norm": 0.5643279552459717, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 2610 + }, + { + "epoch": 2.552362396492937, + "grad_norm": 0.4715031087398529, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 2620 + }, + { + "epoch": 2.5621042377009253, + "grad_norm": 0.528400719165802, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2630 + }, + { + "epoch": 2.571846078908914, + "grad_norm": 0.49469611048698425, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2640 + }, + { + "epoch": 2.581587920116902, + "grad_norm": 0.4567806124687195, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 2650 + }, + { + "epoch": 2.5913297613248902, + "grad_norm": 0.5357107520103455, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2660 + }, + { + "epoch": 2.601071602532879, + "grad_norm": 0.46977677941322327, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 2670 + }, + { + "epoch": 2.610813443740867, + "grad_norm": 0.6626771092414856, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 2680 + }, + { + "epoch": 2.620555284948855, + "grad_norm": 0.4587472081184387, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2690 + }, + { + "epoch": 2.6302971261568437, + "grad_norm": 0.4816797077655792, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2700 + }, + { + "epoch": 2.640038967364832, + "grad_norm": 0.4856809675693512, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.64978080857282, + "grad_norm": 0.46010780334472656, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2720 + }, + { + "epoch": 2.6595226497808087, + "grad_norm": 0.4637954533100128, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 2730 + }, + { + "epoch": 2.669264490988797, + "grad_norm": 0.5954997539520264, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 2740 + }, + { + "epoch": 2.679006332196785, + "grad_norm": 0.5071861743927002, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2750 + }, + { + "epoch": 2.6887481734047736, + "grad_norm": 0.5415477156639099, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.6984900146127617, + "grad_norm": 0.5618549585342407, + "learning_rate": 0.0002, + "loss": 1.5476, + "step": 2770 + }, + { + "epoch": 2.70823185582075, + "grad_norm": 0.49338817596435547, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 2780 + }, + { + "epoch": 2.7179736970287385, + "grad_norm": 0.5149586796760559, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2790 + }, + { + "epoch": 2.7277155382367266, + "grad_norm": 0.6247242093086243, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2800 + }, + { + "epoch": 2.737457379444715, + "grad_norm": 0.4749542474746704, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2810 + }, + { + "epoch": 2.7471992206527034, + "grad_norm": 0.4979191720485687, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 2820 + }, + { + "epoch": 2.7569410618606915, + "grad_norm": 0.4885074198246002, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2830 + }, + { + "epoch": 2.76668290306868, + "grad_norm": 0.5047747492790222, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2840 + }, + { + "epoch": 2.7764247442766683, + "grad_norm": 0.5280140042304993, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2850 + }, + { + "epoch": 2.7861665854846565, + "grad_norm": 0.477668434381485, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 2860 + }, + { + "epoch": 2.795908426692645, + "grad_norm": 0.4816327393054962, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2870 + }, + { + "epoch": 2.805650267900633, + "grad_norm": 0.523259162902832, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2880 + }, + { + "epoch": 2.8153921091086214, + "grad_norm": 0.5045270919799805, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2890 + }, + { + "epoch": 2.82513395031661, + "grad_norm": 0.47986042499542236, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2900 + }, + { + "epoch": 2.834875791524598, + "grad_norm": 0.4858797490596771, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2910 + }, + { + "epoch": 2.8446176327325867, + "grad_norm": 0.5261512398719788, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 2920 + }, + { + "epoch": 2.854359473940575, + "grad_norm": 0.630550742149353, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2930 + }, + { + "epoch": 2.864101315148563, + "grad_norm": 0.49119752645492554, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2940 + }, + { + "epoch": 2.8738431563565516, + "grad_norm": 0.4779070317745209, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2950 + }, + { + "epoch": 2.88358499756454, + "grad_norm": 0.5059782266616821, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 2960 + }, + { + "epoch": 2.893326838772528, + "grad_norm": 0.5466655492782593, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2970 + }, + { + "epoch": 2.9030686799805165, + "grad_norm": 0.4865640103816986, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 2980 + }, + { + "epoch": 2.9128105211885047, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 2990 + }, + { + "epoch": 2.922552362396493, + "grad_norm": 0.9112305641174316, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 3000 + }, + { + "epoch": 2.9322942036044815, + "grad_norm": 0.3938814103603363, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 3010 + }, + { + "epoch": 2.9420360448124696, + "grad_norm": 0.5500800609588623, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 3020 + }, + { + "epoch": 2.9517778860204578, + "grad_norm": 0.5346390604972839, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3030 + }, + { + "epoch": 2.9615197272284464, + "grad_norm": 0.5245014429092407, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3040 + }, + { + "epoch": 2.9712615684364345, + "grad_norm": 0.4906884431838989, + "learning_rate": 0.0002, + "loss": 1.5549, + "step": 3050 + }, + { + "epoch": 2.9810034096444227, + "grad_norm": 0.47086769342422485, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3060 + }, + { + "epoch": 2.9907452508524113, + "grad_norm": 0.5290229320526123, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3070 + }, + { + "epoch": 2.9995129079396006, + "eval_loss": 1.8463934659957886, + "eval_runtime": 56.2401, + "eval_samples_per_second": 9.015, + "eval_steps_per_second": 1.138, + "step": 3079 + }, + { + "epoch": 3.0004870920603994, + "grad_norm": 0.49992576241493225, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 3080 + }, + { + "epoch": 3.0102289332683876, + "grad_norm": 0.8242783546447754, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3090 + }, + { + "epoch": 3.019970774476376, + "grad_norm": 0.6330569386482239, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3100 + }, + { + "epoch": 3.0297126156843643, + "grad_norm": 0.566097617149353, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 3110 + }, + { + "epoch": 3.0394544568923525, + "grad_norm": 0.6337586045265198, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 3120 + }, + { + "epoch": 3.049196298100341, + "grad_norm": 0.7339403033256531, + "learning_rate": 0.0002, + "loss": 1.3916, + "step": 3130 + }, + { + "epoch": 3.0589381393083293, + "grad_norm": 0.7187346816062927, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 3140 + }, + { + "epoch": 3.0686799805163174, + "grad_norm": 0.7116255760192871, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3150 + }, + { + "epoch": 3.078421821724306, + "grad_norm": 0.6493807435035706, + "learning_rate": 0.0002, + "loss": 1.4452, + "step": 3160 + }, + { + "epoch": 3.088163662932294, + "grad_norm": 0.6777266263961792, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 3170 + }, + { + "epoch": 3.0979055041402823, + "grad_norm": 0.6342006325721741, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 3180 + }, + { + "epoch": 3.107647345348271, + "grad_norm": 0.6608964204788208, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 3190 + }, + { + "epoch": 3.117389186556259, + "grad_norm": 0.7230247259140015, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 3200 + }, + { + "epoch": 3.1271310277642472, + "grad_norm": 0.650368332862854, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 3210 + }, + { + "epoch": 3.136872868972236, + "grad_norm": 0.7319342494010925, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 3220 + }, + { + "epoch": 3.146614710180224, + "grad_norm": 0.7159963846206665, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 3230 + }, + { + "epoch": 3.156356551388212, + "grad_norm": 0.8905230164527893, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 3240 + }, + { + "epoch": 3.1660983925962007, + "grad_norm": 0.6920804381370544, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 3250 + }, + { + "epoch": 3.175840233804189, + "grad_norm": 0.6782063841819763, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 3260 + }, + { + "epoch": 3.1855820750121775, + "grad_norm": 0.735325276851654, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3270 + }, + { + "epoch": 3.1953239162201656, + "grad_norm": 0.6657978296279907, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 3280 + }, + { + "epoch": 3.205065757428154, + "grad_norm": 0.771315336227417, + "learning_rate": 0.0002, + "loss": 1.3456, + "step": 3290 + }, + { + "epoch": 3.2148075986361424, + "grad_norm": 0.6492983102798462, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 3300 + }, + { + "epoch": 3.2245494398441306, + "grad_norm": 0.7513770461082458, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3310 + }, + { + "epoch": 3.2342912810521187, + "grad_norm": 0.7091423869132996, + "learning_rate": 0.0002, + "loss": 1.4032, + "step": 3320 + }, + { + "epoch": 3.2440331222601073, + "grad_norm": 0.6663975119590759, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 3330 + }, + { + "epoch": 3.2537749634680955, + "grad_norm": 0.6813122034072876, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 3340 + }, + { + "epoch": 3.2635168046760836, + "grad_norm": 0.6602569818496704, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 3350 + }, + { + "epoch": 3.2732586458840722, + "grad_norm": 0.718270480632782, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 3360 + }, + { + "epoch": 3.2830004870920604, + "grad_norm": 0.6884173154830933, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 3370 + }, + { + "epoch": 3.2927423283000485, + "grad_norm": 0.7039775848388672, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3380 + }, + { + "epoch": 3.302484169508037, + "grad_norm": 0.7444299459457397, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3390 + }, + { + "epoch": 3.3122260107160253, + "grad_norm": 0.7187064290046692, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 3400 + }, + { + "epoch": 3.3219678519240134, + "grad_norm": 0.599396288394928, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3410 + }, + { + "epoch": 3.331709693132002, + "grad_norm": 0.7670390009880066, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3420 + }, + { + "epoch": 3.34145153433999, + "grad_norm": 0.6654478311538696, + "learning_rate": 0.0002, + "loss": 1.4411, + "step": 3430 + }, + { + "epoch": 3.351193375547979, + "grad_norm": 0.6644385457038879, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 3440 + }, + { + "epoch": 3.360935216755967, + "grad_norm": 0.6974098086357117, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3450 + }, + { + "epoch": 3.370677057963955, + "grad_norm": 0.7350399494171143, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 3460 + }, + { + "epoch": 3.3804188991719437, + "grad_norm": 0.714721143245697, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 3470 + }, + { + "epoch": 3.390160740379932, + "grad_norm": 0.7006027698516846, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 3480 + }, + { + "epoch": 3.39990258158792, + "grad_norm": 0.6767925024032593, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3490 + }, + { + "epoch": 3.4096444227959086, + "grad_norm": 0.6721355319023132, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 3500 + }, + { + "epoch": 3.419386264003897, + "grad_norm": 0.6845725178718567, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3510 + }, + { + "epoch": 3.429128105211885, + "grad_norm": 0.6882196664810181, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 3520 + }, + { + "epoch": 3.4388699464198735, + "grad_norm": 0.7663240432739258, + "learning_rate": 0.0002, + "loss": 1.4962, + "step": 3530 + }, + { + "epoch": 3.4486117876278617, + "grad_norm": 0.6304219365119934, + "learning_rate": 0.0002, + "loss": 1.4644, + "step": 3540 + }, + { + "epoch": 3.45835362883585, + "grad_norm": 0.668678879737854, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3550 + }, + { + "epoch": 3.4680954700438384, + "grad_norm": 0.7526912093162537, + "learning_rate": 0.0002, + "loss": 1.4874, + "step": 3560 + }, + { + "epoch": 3.4778373112518266, + "grad_norm": 1.089495301246643, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 3570 + }, + { + "epoch": 3.4875791524598148, + "grad_norm": 0.7282902002334595, + "learning_rate": 0.0002, + "loss": 1.3871, + "step": 3580 + }, + { + "epoch": 3.4973209936678034, + "grad_norm": 0.6540156602859497, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3590 + }, + { + "epoch": 3.5070628348757915, + "grad_norm": 0.6449568867683411, + "learning_rate": 0.0002, + "loss": 1.4367, + "step": 3600 + }, + { + "epoch": 3.5168046760837797, + "grad_norm": 0.7262216210365295, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 3610 + }, + { + "epoch": 3.5265465172917683, + "grad_norm": 0.6048615574836731, + "learning_rate": 0.0002, + "loss": 1.4374, + "step": 3620 + }, + { + "epoch": 3.5362883584997564, + "grad_norm": 0.6780537366867065, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 3630 + }, + { + "epoch": 3.5460301997077446, + "grad_norm": 0.6851925253868103, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 3640 + }, + { + "epoch": 3.555772040915733, + "grad_norm": 0.6530634164810181, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3650 + }, + { + "epoch": 3.5655138821237213, + "grad_norm": 0.7193992733955383, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3660 + }, + { + "epoch": 3.5752557233317095, + "grad_norm": 0.767496645450592, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 3670 + }, + { + "epoch": 3.584997564539698, + "grad_norm": 0.6912919282913208, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 3680 + }, + { + "epoch": 3.5947394057476862, + "grad_norm": 0.7383436560630798, + "learning_rate": 0.0002, + "loss": 1.4497, + "step": 3690 + }, + { + "epoch": 3.6044812469556744, + "grad_norm": 0.6746662855148315, + "learning_rate": 0.0002, + "loss": 1.4822, + "step": 3700 + }, + { + "epoch": 3.614223088163663, + "grad_norm": 0.6885138750076294, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3710 + }, + { + "epoch": 3.623964929371651, + "grad_norm": 0.6694392561912537, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3720 + }, + { + "epoch": 3.6337067705796393, + "grad_norm": 0.812358021736145, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 3730 + }, + { + "epoch": 3.643448611787628, + "grad_norm": 0.7267130017280579, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 3740 + }, + { + "epoch": 3.653190452995616, + "grad_norm": 0.6958749294281006, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3750 + }, + { + "epoch": 3.6629322942036042, + "grad_norm": 0.6805673241615295, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3760 + }, + { + "epoch": 3.672674135411593, + "grad_norm": 0.7184410095214844, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3770 + }, + { + "epoch": 3.682415976619581, + "grad_norm": 0.7716330289840698, + "learning_rate": 0.0002, + "loss": 1.3935, + "step": 3780 + }, + { + "epoch": 3.6921578178275696, + "grad_norm": 0.6675831079483032, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3790 + }, + { + "epoch": 3.7018996590355577, + "grad_norm": 0.6480095386505127, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 3800 + }, + { + "epoch": 3.711641500243546, + "grad_norm": 0.6559418439865112, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 3810 + }, + { + "epoch": 3.7213833414515345, + "grad_norm": 0.6596545577049255, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 3820 + }, + { + "epoch": 3.7311251826595226, + "grad_norm": 0.7172950506210327, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3830 + }, + { + "epoch": 3.740867023867511, + "grad_norm": 0.796148419380188, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 3840 + }, + { + "epoch": 3.7506088650754994, + "grad_norm": 0.6600322723388672, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 3850 + }, + { + "epoch": 3.7603507062834876, + "grad_norm": 0.6776387691497803, + "learning_rate": 0.0002, + "loss": 1.4201, + "step": 3860 + }, + { + "epoch": 3.770092547491476, + "grad_norm": 0.7768304347991943, + "learning_rate": 0.0002, + "loss": 1.3893, + "step": 3870 + }, + { + "epoch": 3.7798343886994643, + "grad_norm": 1.0579794645309448, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 3880 + }, + { + "epoch": 3.7895762299074525, + "grad_norm": 0.6757252812385559, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 3890 + }, + { + "epoch": 3.799318071115441, + "grad_norm": 0.6706996560096741, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3900 + }, + { + "epoch": 3.809059912323429, + "grad_norm": 0.7026948928833008, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 3910 + }, + { + "epoch": 3.8188017535314174, + "grad_norm": 0.6437768340110779, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 3920 + }, + { + "epoch": 3.828543594739406, + "grad_norm": 0.7015706300735474, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 3930 + }, + { + "epoch": 3.838285435947394, + "grad_norm": 0.7049482464790344, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 3940 + }, + { + "epoch": 3.8480272771553823, + "grad_norm": 0.6533724665641785, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3950 + }, + { + "epoch": 3.857769118363371, + "grad_norm": 0.7312499284744263, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 3960 + }, + { + "epoch": 3.867510959571359, + "grad_norm": 0.6858801245689392, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3970 + }, + { + "epoch": 3.877252800779347, + "grad_norm": 0.770423173904419, + "learning_rate": 0.0002, + "loss": 1.4423, + "step": 3980 + }, + { + "epoch": 3.886994641987336, + "grad_norm": 0.6987539529800415, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 3990 + }, + { + "epoch": 3.896736483195324, + "grad_norm": 0.7072722315788269, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 4000 + }, + { + "epoch": 3.906478324403312, + "grad_norm": 0.6492931842803955, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4010 + }, + { + "epoch": 3.9162201656113007, + "grad_norm": 0.7716232538223267, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 4020 + }, + { + "epoch": 3.925962006819289, + "grad_norm": 0.722949743270874, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 4030 + }, + { + "epoch": 3.935703848027277, + "grad_norm": 0.7434365749359131, + "learning_rate": 0.0002, + "loss": 1.3914, + "step": 4040 + }, + { + "epoch": 3.9454456892352656, + "grad_norm": 0.6691509485244751, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 4050 + }, + { + "epoch": 3.9551875304432538, + "grad_norm": 0.6850284337997437, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4060 + }, + { + "epoch": 3.964929371651242, + "grad_norm": 0.6954452991485596, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4070 + }, + { + "epoch": 3.9746712128592305, + "grad_norm": 0.9316364526748657, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 4080 + }, + { + "epoch": 3.9844130540672187, + "grad_norm": 0.6908289194107056, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 4090 + }, + { + "epoch": 3.994154895275207, + "grad_norm": 0.666782021522522, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 4100 + }, + { + "epoch": 4.0, + "eval_loss": 1.9233275651931763, + "eval_runtime": 55.9536, + "eval_samples_per_second": 9.061, + "eval_steps_per_second": 1.144, + "step": 4106 + }, + { + "epoch": 4.003896736483195, + "grad_norm": 0.7726166248321533, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 4110 + }, + { + "epoch": 4.013638577691184, + "grad_norm": 1.1338967084884644, + "learning_rate": 0.0002, + "loss": 1.1415, + "step": 4120 + }, + { + "epoch": 4.023380418899172, + "grad_norm": 0.9530029296875, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 4130 + }, + { + "epoch": 4.03312226010716, + "grad_norm": 1.1058554649353027, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 4140 + }, + { + "epoch": 4.042864101315149, + "grad_norm": 0.8765049576759338, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 4150 + }, + { + "epoch": 4.052605942523137, + "grad_norm": 1.1774667501449585, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4160 + }, + { + "epoch": 4.062347783731125, + "grad_norm": 0.9301433563232422, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 4170 + }, + { + "epoch": 4.072089624939114, + "grad_norm": 1.0196778774261475, + "learning_rate": 0.0002, + "loss": 1.1807, + "step": 4180 + }, + { + "epoch": 4.081831466147102, + "grad_norm": 1.1380577087402344, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 4190 + }, + { + "epoch": 4.09157330735509, + "grad_norm": 0.9121319651603699, + "learning_rate": 0.0002, + "loss": 1.2521, + "step": 4200 + }, + { + "epoch": 4.101315148563079, + "grad_norm": 0.9495378732681274, + "learning_rate": 0.0002, + "loss": 1.1747, + "step": 4210 + }, + { + "epoch": 4.1110569897710665, + "grad_norm": 0.8058680295944214, + "learning_rate": 0.0002, + "loss": 1.1829, + "step": 4220 + }, + { + "epoch": 4.120798830979055, + "grad_norm": 1.000887393951416, + "learning_rate": 0.0002, + "loss": 1.1732, + "step": 4230 + }, + { + "epoch": 4.130540672187044, + "grad_norm": 0.9529102444648743, + "learning_rate": 0.0002, + "loss": 1.1947, + "step": 4240 + }, + { + "epoch": 4.140282513395031, + "grad_norm": 1.0257115364074707, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 4250 + }, + { + "epoch": 4.15002435460302, + "grad_norm": 0.9590303897857666, + "learning_rate": 0.0002, + "loss": 1.2293, + "step": 4260 + }, + { + "epoch": 4.159766195811009, + "grad_norm": 1.065291166305542, + "learning_rate": 0.0002, + "loss": 1.1918, + "step": 4270 + }, + { + "epoch": 4.169508037018996, + "grad_norm": 0.8819697499275208, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 4280 + }, + { + "epoch": 4.179249878226985, + "grad_norm": 1.0335261821746826, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 4290 + }, + { + "epoch": 4.1889917194349735, + "grad_norm": 0.8872809410095215, + "learning_rate": 0.0002, + "loss": 1.2131, + "step": 4300 + }, + { + "epoch": 4.198733560642961, + "grad_norm": 0.9883159399032593, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 4310 + }, + { + "epoch": 4.20847540185095, + "grad_norm": 1.0254192352294922, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 4320 + }, + { + "epoch": 4.218217243058938, + "grad_norm": 0.9432600736618042, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 4330 + }, + { + "epoch": 4.227959084266926, + "grad_norm": 1.1008676290512085, + "learning_rate": 0.0002, + "loss": 1.2684, + "step": 4340 + }, + { + "epoch": 4.237700925474915, + "grad_norm": 1.0829699039459229, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 4350 + }, + { + "epoch": 4.247442766682903, + "grad_norm": 1.016847848892212, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 4360 + }, + { + "epoch": 4.257184607890891, + "grad_norm": 0.8924864530563354, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 4370 + }, + { + "epoch": 4.26692644909888, + "grad_norm": 0.9300530552864075, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 4380 + }, + { + "epoch": 4.276668290306868, + "grad_norm": 0.9684814810752869, + "learning_rate": 0.0002, + "loss": 1.1696, + "step": 4390 + }, + { + "epoch": 4.286410131514856, + "grad_norm": 0.9916250705718994, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 4400 + }, + { + "epoch": 4.2961519727228445, + "grad_norm": 0.903680145740509, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 4410 + }, + { + "epoch": 4.305893813930833, + "grad_norm": 0.8713505268096924, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 4420 + }, + { + "epoch": 4.315635655138821, + "grad_norm": 0.9983905553817749, + "learning_rate": 0.0002, + "loss": 1.1957, + "step": 4430 + }, + { + "epoch": 4.3253774963468095, + "grad_norm": 1.1689040660858154, + "learning_rate": 0.0002, + "loss": 1.2676, + "step": 4440 + }, + { + "epoch": 4.335119337554798, + "grad_norm": 0.9316853880882263, + "learning_rate": 0.0002, + "loss": 1.2166, + "step": 4450 + }, + { + "epoch": 4.344861178762786, + "grad_norm": 0.9175887107849121, + "learning_rate": 0.0002, + "loss": 1.222, + "step": 4460 + }, + { + "epoch": 4.354603019970774, + "grad_norm": 0.9348906874656677, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4470 + }, + { + "epoch": 4.364344861178763, + "grad_norm": 0.9727016687393188, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 4480 + }, + { + "epoch": 4.374086702386751, + "grad_norm": 0.9843429923057556, + "learning_rate": 0.0002, + "loss": 1.2616, + "step": 4490 + }, + { + "epoch": 4.383828543594739, + "grad_norm": 0.9615852236747742, + "learning_rate": 0.0002, + "loss": 1.2488, + "step": 4500 + }, + { + "epoch": 4.393570384802728, + "grad_norm": 0.9688583612442017, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 4510 + }, + { + "epoch": 4.403312226010716, + "grad_norm": 0.9933668375015259, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 4520 + }, + { + "epoch": 4.413054067218704, + "grad_norm": 1.0626686811447144, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 4530 + }, + { + "epoch": 4.422795908426693, + "grad_norm": 0.9536267518997192, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 4540 + }, + { + "epoch": 4.432537749634681, + "grad_norm": 0.9777140021324158, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4550 + }, + { + "epoch": 4.442279590842669, + "grad_norm": 0.980780839920044, + "learning_rate": 0.0002, + "loss": 1.2878, + "step": 4560 + }, + { + "epoch": 4.452021432050658, + "grad_norm": 1.0147196054458618, + "learning_rate": 0.0002, + "loss": 1.2597, + "step": 4570 + }, + { + "epoch": 4.461763273258645, + "grad_norm": 0.9763361811637878, + "learning_rate": 0.0002, + "loss": 1.2148, + "step": 4580 + }, + { + "epoch": 4.471505114466634, + "grad_norm": 1.0300798416137695, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 4590 + }, + { + "epoch": 4.481246955674623, + "grad_norm": 0.8833121657371521, + "learning_rate": 0.0002, + "loss": 1.2665, + "step": 4600 + }, + { + "epoch": 4.490988796882611, + "grad_norm": 1.1214020252227783, + "learning_rate": 0.0002, + "loss": 1.1899, + "step": 4610 + }, + { + "epoch": 4.500730638090599, + "grad_norm": 0.8843787908554077, + "learning_rate": 0.0002, + "loss": 1.2579, + "step": 4620 + }, + { + "epoch": 4.5104724792985875, + "grad_norm": 0.9942020773887634, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 4630 + }, + { + "epoch": 4.520214320506576, + "grad_norm": 1.0033202171325684, + "learning_rate": 0.0002, + "loss": 1.3172, + "step": 4640 + }, + { + "epoch": 4.529956161714564, + "grad_norm": 0.8767235279083252, + "learning_rate": 0.0002, + "loss": 1.2024, + "step": 4650 + }, + { + "epoch": 4.539698002922552, + "grad_norm": 1.0117276906967163, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 4660 + }, + { + "epoch": 4.549439844130541, + "grad_norm": 1.2787362337112427, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 4670 + }, + { + "epoch": 4.559181685338529, + "grad_norm": 0.8824878931045532, + "learning_rate": 0.0002, + "loss": 1.2603, + "step": 4680 + }, + { + "epoch": 4.568923526546517, + "grad_norm": 0.9209560751914978, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4690 + }, + { + "epoch": 4.578665367754506, + "grad_norm": 1.1064010858535767, + "learning_rate": 0.0002, + "loss": 1.1916, + "step": 4700 + }, + { + "epoch": 4.588407208962494, + "grad_norm": 0.8914572596549988, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 4710 + }, + { + "epoch": 4.598149050170482, + "grad_norm": 1.0412265062332153, + "learning_rate": 0.0002, + "loss": 1.2861, + "step": 4720 + }, + { + "epoch": 4.607890891378471, + "grad_norm": 1.1950221061706543, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4730 + }, + { + "epoch": 4.617632732586459, + "grad_norm": 0.8938062787055969, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 4740 + }, + { + "epoch": 4.627374573794447, + "grad_norm": 0.9849569201469421, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 4750 + }, + { + "epoch": 4.637116415002436, + "grad_norm": 1.0081515312194824, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4760 + }, + { + "epoch": 4.6468582562104235, + "grad_norm": 0.8566309213638306, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 4770 + }, + { + "epoch": 4.656600097418412, + "grad_norm": 1.1750118732452393, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 4780 + }, + { + "epoch": 4.666341938626401, + "grad_norm": 0.925502598285675, + "learning_rate": 0.0002, + "loss": 1.2537, + "step": 4790 + }, + { + "epoch": 4.676083779834388, + "grad_norm": 1.0402472019195557, + "learning_rate": 0.0002, + "loss": 1.2146, + "step": 4800 + }, + { + "epoch": 4.685825621042377, + "grad_norm": 0.9772472977638245, + "learning_rate": 0.0002, + "loss": 1.2555, + "step": 4810 + }, + { + "epoch": 4.695567462250366, + "grad_norm": 0.9082779288291931, + "learning_rate": 0.0002, + "loss": 1.2667, + "step": 4820 + }, + { + "epoch": 4.705309303458353, + "grad_norm": 0.8026862740516663, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 4830 + }, + { + "epoch": 4.715051144666342, + "grad_norm": 1.1631089448928833, + "learning_rate": 0.0002, + "loss": 1.3369, + "step": 4840 + }, + { + "epoch": 4.7247929858743305, + "grad_norm": 0.9384787678718567, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 4850 + }, + { + "epoch": 4.734534827082318, + "grad_norm": 1.2151581048965454, + "learning_rate": 0.0002, + "loss": 1.2588, + "step": 4860 + }, + { + "epoch": 4.744276668290307, + "grad_norm": 0.9679436087608337, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 4870 + }, + { + "epoch": 4.754018509498295, + "grad_norm": 0.8352158069610596, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 4880 + }, + { + "epoch": 4.763760350706283, + "grad_norm": 1.0205804109573364, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 4890 + }, + { + "epoch": 4.773502191914272, + "grad_norm": 0.9814772605895996, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 4900 + }, + { + "epoch": 4.78324403312226, + "grad_norm": 1.002854347229004, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 4910 + }, + { + "epoch": 4.792985874330248, + "grad_norm": 1.1609505414962769, + "learning_rate": 0.0002, + "loss": 1.3143, + "step": 4920 + }, + { + "epoch": 4.802727715538237, + "grad_norm": 0.9354982376098633, + "learning_rate": 0.0002, + "loss": 1.3166, + "step": 4930 + }, + { + "epoch": 4.812469556746225, + "grad_norm": 0.9761685729026794, + "learning_rate": 0.0002, + "loss": 1.2978, + "step": 4940 + }, + { + "epoch": 4.822211397954213, + "grad_norm": 1.0604596138000488, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 4950 + }, + { + "epoch": 4.8319532391622015, + "grad_norm": 1.0902808904647827, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 4960 + }, + { + "epoch": 4.84169508037019, + "grad_norm": 1.0174955129623413, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 4970 + }, + { + "epoch": 4.851436921578179, + "grad_norm": 1.0995253324508667, + "learning_rate": 0.0002, + "loss": 1.3141, + "step": 4980 + }, + { + "epoch": 4.8611787627861665, + "grad_norm": 0.880993127822876, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4990 + }, + { + "epoch": 4.870920603994155, + "grad_norm": 0.9472237825393677, + "learning_rate": 0.0002, + "loss": 1.2547, + "step": 5000 + }, + { + "epoch": 4.880662445202143, + "grad_norm": 0.9504236578941345, + "learning_rate": 0.0002, + "loss": 1.4078, + "step": 5010 + }, + { + "epoch": 4.890404286410131, + "grad_norm": 1.1261742115020752, + "learning_rate": 0.0002, + "loss": 1.2791, + "step": 5020 + }, + { + "epoch": 4.90014612761812, + "grad_norm": 0.904674768447876, + "learning_rate": 0.0002, + "loss": 1.3707, + "step": 5030 + }, + { + "epoch": 4.909887968826109, + "grad_norm": 0.8828991055488586, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 5040 + }, + { + "epoch": 4.919629810034096, + "grad_norm": 1.0156532526016235, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 5050 + }, + { + "epoch": 4.929371651242085, + "grad_norm": 0.8975168466567993, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 5060 + }, + { + "epoch": 4.939113492450073, + "grad_norm": 0.9787213802337646, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 5070 + }, + { + "epoch": 4.948855333658061, + "grad_norm": 1.0801568031311035, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 5080 + }, + { + "epoch": 4.95859717486605, + "grad_norm": 1.0655089616775513, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 5090 + }, + { + "epoch": 4.968339016074038, + "grad_norm": 0.8941320180892944, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 5100 + }, + { + "epoch": 4.978080857282026, + "grad_norm": 1.050621747970581, + "learning_rate": 0.0002, + "loss": 1.2846, + "step": 5110 + }, + { + "epoch": 4.987822698490015, + "grad_norm": 0.9724781513214111, + "learning_rate": 0.0002, + "loss": 1.3791, + "step": 5120 + }, + { + "epoch": 4.997564539698003, + "grad_norm": 0.9850538969039917, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 5130 + }, + { + "epoch": 4.9995129079396, + "eval_loss": 2.0824170112609863, + "eval_runtime": 55.592, + "eval_samples_per_second": 9.12, + "eval_steps_per_second": 1.151, + "step": 5132 + }, + { + "epoch": 5.007306380905991, + "grad_norm": 1.0096189975738525, + "learning_rate": 0.0002, + "loss": 1.037, + "step": 5140 + }, + { + "epoch": 5.01704822211398, + "grad_norm": 1.2403408288955688, + "learning_rate": 0.0002, + "loss": 1.0003, + "step": 5150 + }, + { + "epoch": 5.026790063321968, + "grad_norm": 1.1243221759796143, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 5160 + }, + { + "epoch": 5.036531904529956, + "grad_norm": 1.4745502471923828, + "learning_rate": 0.0002, + "loss": 0.9815, + "step": 5170 + }, + { + "epoch": 5.0462737457379445, + "grad_norm": 1.1913198232650757, + "learning_rate": 0.0002, + "loss": 0.9715, + "step": 5180 + }, + { + "epoch": 5.056015586945933, + "grad_norm": 1.2732855081558228, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 5190 + }, + { + "epoch": 5.065757428153921, + "grad_norm": 1.1737396717071533, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 5200 + }, + { + "epoch": 5.075499269361909, + "grad_norm": 1.4162768125534058, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 5210 + }, + { + "epoch": 5.085241110569898, + "grad_norm": 1.528274655342102, + "learning_rate": 0.0002, + "loss": 1.0333, + "step": 5220 + }, + { + "epoch": 5.094982951777886, + "grad_norm": 1.3966618776321411, + "learning_rate": 0.0002, + "loss": 1.0227, + "step": 5230 + }, + { + "epoch": 5.104724792985874, + "grad_norm": 1.3427953720092773, + "learning_rate": 0.0002, + "loss": 0.987, + "step": 5240 + }, + { + "epoch": 5.114466634193863, + "grad_norm": 1.6533905267715454, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 5250 + }, + { + "epoch": 5.124208475401851, + "grad_norm": 1.4114865064620972, + "learning_rate": 0.0002, + "loss": 1.0452, + "step": 5260 + }, + { + "epoch": 5.133950316609839, + "grad_norm": 1.5460708141326904, + "learning_rate": 0.0002, + "loss": 1.067, + "step": 5270 + }, + { + "epoch": 5.143692157817828, + "grad_norm": 1.3491919040679932, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 5280 + }, + { + "epoch": 5.153433999025816, + "grad_norm": 1.2208969593048096, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 5290 + }, + { + "epoch": 5.163175840233804, + "grad_norm": 1.1141403913497925, + "learning_rate": 0.0002, + "loss": 1.0362, + "step": 5300 + }, + { + "epoch": 5.172917681441793, + "grad_norm": 1.2938064336776733, + "learning_rate": 0.0002, + "loss": 0.9744, + "step": 5310 + }, + { + "epoch": 5.1826595226497805, + "grad_norm": 1.2704918384552002, + "learning_rate": 0.0002, + "loss": 1.0438, + "step": 5320 + }, + { + "epoch": 5.192401363857769, + "grad_norm": 1.3928544521331787, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 5330 + }, + { + "epoch": 5.202143205065758, + "grad_norm": 1.1993824243545532, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 5340 + }, + { + "epoch": 5.211885046273745, + "grad_norm": 1.5913670063018799, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 5350 + }, + { + "epoch": 5.221626887481734, + "grad_norm": 1.1577855348587036, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 5360 + }, + { + "epoch": 5.231368728689723, + "grad_norm": 1.4535993337631226, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 5370 + }, + { + "epoch": 5.24111056989771, + "grad_norm": 1.5068976879119873, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 5380 + }, + { + "epoch": 5.250852411105699, + "grad_norm": 1.2365459203720093, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 5390 + }, + { + "epoch": 5.2605942523136875, + "grad_norm": 1.3197922706604004, + "learning_rate": 0.0002, + "loss": 1.0145, + "step": 5400 + }, + { + "epoch": 5.270336093521675, + "grad_norm": 1.2395117282867432, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 5410 + }, + { + "epoch": 5.280077934729664, + "grad_norm": 1.1841236352920532, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 5420 + }, + { + "epoch": 5.289819775937652, + "grad_norm": 1.218003749847412, + "learning_rate": 0.0002, + "loss": 1.0233, + "step": 5430 + }, + { + "epoch": 5.29956161714564, + "grad_norm": 1.2210947275161743, + "learning_rate": 0.0002, + "loss": 1.0093, + "step": 5440 + }, + { + "epoch": 5.309303458353629, + "grad_norm": 1.266006588935852, + "learning_rate": 0.0002, + "loss": 0.9619, + "step": 5450 + }, + { + "epoch": 5.319045299561617, + "grad_norm": 1.2598075866699219, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 5460 + }, + { + "epoch": 5.328787140769606, + "grad_norm": 1.2410019636154175, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 5470 + }, + { + "epoch": 5.338528981977594, + "grad_norm": 1.249698519706726, + "learning_rate": 0.0002, + "loss": 1.058, + "step": 5480 + }, + { + "epoch": 5.348270823185582, + "grad_norm": 1.2398173809051514, + "learning_rate": 0.0002, + "loss": 1.0457, + "step": 5490 + }, + { + "epoch": 5.35801266439357, + "grad_norm": 1.2416654825210571, + "learning_rate": 0.0002, + "loss": 1.0139, + "step": 5500 + }, + { + "epoch": 5.3677545056015585, + "grad_norm": 1.398706316947937, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 5510 + }, + { + "epoch": 5.377496346809547, + "grad_norm": 1.3049418926239014, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 5520 + }, + { + "epoch": 5.387238188017536, + "grad_norm": 1.2528893947601318, + "learning_rate": 0.0002, + "loss": 1.0912, + "step": 5530 + }, + { + "epoch": 5.3969800292255234, + "grad_norm": 1.2963255643844604, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 5540 + }, + { + "epoch": 5.406721870433512, + "grad_norm": 1.494231104850769, + "learning_rate": 0.0002, + "loss": 1.0194, + "step": 5550 + }, + { + "epoch": 5.416463711641501, + "grad_norm": 1.2760992050170898, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 5560 + }, + { + "epoch": 5.426205552849488, + "grad_norm": 1.195292592048645, + "learning_rate": 0.0002, + "loss": 1.1088, + "step": 5570 + }, + { + "epoch": 5.435947394057477, + "grad_norm": 1.6408965587615967, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 5580 + }, + { + "epoch": 5.4456892352654656, + "grad_norm": 1.3092058897018433, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 5590 + }, + { + "epoch": 5.455431076473453, + "grad_norm": 1.2960586547851562, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 5600 + }, + { + "epoch": 5.465172917681442, + "grad_norm": 1.3560487031936646, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 5610 + }, + { + "epoch": 5.4749147588894305, + "grad_norm": 1.1896311044692993, + "learning_rate": 0.0002, + "loss": 1.0314, + "step": 5620 + }, + { + "epoch": 5.484656600097418, + "grad_norm": 1.3145595788955688, + "learning_rate": 0.0002, + "loss": 1.0435, + "step": 5630 + }, + { + "epoch": 5.494398441305407, + "grad_norm": 1.2207404375076294, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 5640 + }, + { + "epoch": 5.504140282513395, + "grad_norm": 1.266015887260437, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 5650 + }, + { + "epoch": 5.513882123721383, + "grad_norm": 1.2478289604187012, + "learning_rate": 0.0002, + "loss": 1.0696, + "step": 5660 + }, + { + "epoch": 5.523623964929372, + "grad_norm": 1.4851372241973877, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 5670 + }, + { + "epoch": 5.53336580613736, + "grad_norm": 1.4478679895401, + "learning_rate": 0.0002, + "loss": 1.0736, + "step": 5680 + }, + { + "epoch": 5.543107647345348, + "grad_norm": 1.1079537868499756, + "learning_rate": 0.0002, + "loss": 1.043, + "step": 5690 + }, + { + "epoch": 5.552849488553337, + "grad_norm": 1.4201879501342773, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 5700 + }, + { + "epoch": 5.562591329761325, + "grad_norm": 1.2092000246047974, + "learning_rate": 0.0002, + "loss": 1.0697, + "step": 5710 + }, + { + "epoch": 5.572333170969313, + "grad_norm": 1.4515851736068726, + "learning_rate": 0.0002, + "loss": 0.9868, + "step": 5720 + }, + { + "epoch": 5.5820750121773015, + "grad_norm": 1.3260412216186523, + "learning_rate": 0.0002, + "loss": 1.1547, + "step": 5730 + }, + { + "epoch": 5.59181685338529, + "grad_norm": 1.248191475868225, + "learning_rate": 0.0002, + "loss": 1.1388, + "step": 5740 + }, + { + "epoch": 5.601558694593278, + "grad_norm": 1.2037307024002075, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 5750 + }, + { + "epoch": 5.611300535801266, + "grad_norm": 1.341237187385559, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 5760 + }, + { + "epoch": 5.621042377009255, + "grad_norm": 1.130115270614624, + "learning_rate": 0.0002, + "loss": 1.0942, + "step": 5770 + }, + { + "epoch": 5.630784218217243, + "grad_norm": 1.3834772109985352, + "learning_rate": 0.0002, + "loss": 1.1029, + "step": 5780 + }, + { + "epoch": 5.640526059425231, + "grad_norm": 1.2586270570755005, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 5790 + }, + { + "epoch": 5.65026790063322, + "grad_norm": 1.3233023881912231, + "learning_rate": 0.0002, + "loss": 1.0186, + "step": 5800 + }, + { + "epoch": 5.660009741841208, + "grad_norm": 1.2711341381072998, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 5810 + }, + { + "epoch": 5.669751583049196, + "grad_norm": 1.3867720365524292, + "learning_rate": 0.0002, + "loss": 1.0897, + "step": 5820 + }, + { + "epoch": 5.679493424257185, + "grad_norm": 1.4783269166946411, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 5830 + }, + { + "epoch": 5.6892352654651726, + "grad_norm": 1.2744768857955933, + "learning_rate": 0.0002, + "loss": 1.0632, + "step": 5840 + }, + { + "epoch": 5.698977106673161, + "grad_norm": 1.3405882120132446, + "learning_rate": 0.0002, + "loss": 1.1484, + "step": 5850 + }, + { + "epoch": 5.70871894788115, + "grad_norm": 1.204300880432129, + "learning_rate": 0.0002, + "loss": 1.0975, + "step": 5860 + }, + { + "epoch": 5.7184607890891375, + "grad_norm": 1.2954572439193726, + "learning_rate": 0.0002, + "loss": 1.0494, + "step": 5870 + }, + { + "epoch": 5.728202630297126, + "grad_norm": 1.5478382110595703, + "learning_rate": 0.0002, + "loss": 1.0643, + "step": 5880 + }, + { + "epoch": 5.737944471505115, + "grad_norm": 1.2095842361450195, + "learning_rate": 0.0002, + "loss": 1.0582, + "step": 5890 + }, + { + "epoch": 5.747686312713103, + "grad_norm": 1.0691519975662231, + "learning_rate": 0.0002, + "loss": 1.1, + "step": 5900 + }, + { + "epoch": 5.757428153921091, + "grad_norm": 1.1920677423477173, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 5910 + }, + { + "epoch": 5.76716999512908, + "grad_norm": 1.2051277160644531, + "learning_rate": 0.0002, + "loss": 1.1746, + "step": 5920 + }, + { + "epoch": 5.776911836337067, + "grad_norm": 1.197490930557251, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 5930 + }, + { + "epoch": 5.786653677545056, + "grad_norm": 1.2003998756408691, + "learning_rate": 0.0002, + "loss": 1.07, + "step": 5940 + }, + { + "epoch": 5.7963955187530445, + "grad_norm": 1.2323646545410156, + "learning_rate": 0.0002, + "loss": 1.0938, + "step": 5950 + }, + { + "epoch": 5.806137359961033, + "grad_norm": 1.2593932151794434, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 5960 + }, + { + "epoch": 5.815879201169021, + "grad_norm": 1.1835976839065552, + "learning_rate": 0.0002, + "loss": 1.0829, + "step": 5970 + }, + { + "epoch": 5.825621042377009, + "grad_norm": 1.4770104885101318, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 5980 + }, + { + "epoch": 5.835362883584997, + "grad_norm": 1.1025809049606323, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 5990 + }, + { + "epoch": 5.845104724792986, + "grad_norm": 1.364588975906372, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 6000 + }, + { + "epoch": 5.854846566000974, + "grad_norm": 1.2340112924575806, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 6010 + }, + { + "epoch": 5.864588407208963, + "grad_norm": 1.4925711154937744, + "learning_rate": 0.0002, + "loss": 1.1123, + "step": 6020 + }, + { + "epoch": 5.874330248416951, + "grad_norm": 1.3516744375228882, + "learning_rate": 0.0002, + "loss": 1.12, + "step": 6030 + }, + { + "epoch": 5.884072089624939, + "grad_norm": 1.2058138847351074, + "learning_rate": 0.0002, + "loss": 1.1399, + "step": 6040 + }, + { + "epoch": 5.893813930832927, + "grad_norm": 1.13870108127594, + "learning_rate": 0.0002, + "loss": 1.1074, + "step": 6050 + }, + { + "epoch": 5.9035557720409155, + "grad_norm": 1.1587319374084473, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 6060 + }, + { + "epoch": 5.913297613248904, + "grad_norm": 1.164481520652771, + "learning_rate": 0.0002, + "loss": 1.1376, + "step": 6070 + }, + { + "epoch": 5.923039454456893, + "grad_norm": 1.2115206718444824, + "learning_rate": 0.0002, + "loss": 1.1262, + "step": 6080 + }, + { + "epoch": 5.93278129566488, + "grad_norm": 1.3201590776443481, + "learning_rate": 0.0002, + "loss": 1.1345, + "step": 6090 + }, + { + "epoch": 5.942523136872869, + "grad_norm": 1.287380576133728, + "learning_rate": 0.0002, + "loss": 1.1288, + "step": 6100 + }, + { + "epoch": 5.952264978080858, + "grad_norm": 1.1820166110992432, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 6110 + }, + { + "epoch": 5.962006819288845, + "grad_norm": 1.2550667524337769, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 6120 + }, + { + "epoch": 5.971748660496834, + "grad_norm": 1.3547813892364502, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 6130 + }, + { + "epoch": 5.9814905017048225, + "grad_norm": 1.260842204093933, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 6140 + }, + { + "epoch": 5.99123234291281, + "grad_norm": 1.1643036603927612, + "learning_rate": 0.0002, + "loss": 1.1119, + "step": 6150 + }, + { + "epoch": 6.0, + "eval_loss": 2.2628161907196045, + "eval_runtime": 57.2379, + "eval_samples_per_second": 8.858, + "eval_steps_per_second": 1.118, + "step": 6159 + }, + { + "epoch": 6.000974184120799, + "grad_norm": 0.9384723901748657, + "learning_rate": 0.0002, + "loss": 1.0837, + "step": 6160 + }, + { + "epoch": 6.0107160253287875, + "grad_norm": 2.1525821685791016, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6170 + }, + { + "epoch": 6.020457866536775, + "grad_norm": 2.0194077491760254, + "learning_rate": 0.0002, + "loss": 0.8416, + "step": 6180 + }, + { + "epoch": 6.030199707744764, + "grad_norm": 1.5257816314697266, + "learning_rate": 0.0002, + "loss": 0.8443, + "step": 6190 + }, + { + "epoch": 6.039941548952752, + "grad_norm": 1.5432662963867188, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 6200 + }, + { + "epoch": 6.04968339016074, + "grad_norm": 1.6874405145645142, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 6210 + }, + { + "epoch": 6.059425231368729, + "grad_norm": 1.7346407175064087, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 6220 + }, + { + "epoch": 6.069167072576717, + "grad_norm": 1.5320781469345093, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6230 + }, + { + "epoch": 6.078908913784705, + "grad_norm": 1.4106669425964355, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 6240 + }, + { + "epoch": 6.088650754992694, + "grad_norm": 1.5568628311157227, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 6250 + }, + { + "epoch": 6.098392596200682, + "grad_norm": 1.6155978441238403, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 6260 + }, + { + "epoch": 6.10813443740867, + "grad_norm": 1.4820445775985718, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 6270 + }, + { + "epoch": 6.1178762786166585, + "grad_norm": 1.6163820028305054, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 6280 + }, + { + "epoch": 6.127618119824647, + "grad_norm": 1.8396387100219727, + "learning_rate": 0.0002, + "loss": 0.853, + "step": 6290 + }, + { + "epoch": 6.137359961032635, + "grad_norm": 1.7181230783462524, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 6300 + }, + { + "epoch": 6.147101802240623, + "grad_norm": 1.6568509340286255, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 6310 + }, + { + "epoch": 6.156843643448612, + "grad_norm": 1.3481947183609009, + "learning_rate": 0.0002, + "loss": 0.8525, + "step": 6320 + }, + { + "epoch": 6.1665854846566, + "grad_norm": 1.5788342952728271, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 6330 + }, + { + "epoch": 6.176327325864588, + "grad_norm": 1.5067620277404785, + "learning_rate": 0.0002, + "loss": 0.886, + "step": 6340 + }, + { + "epoch": 6.186069167072577, + "grad_norm": 1.8198208808898926, + "learning_rate": 0.0002, + "loss": 0.8375, + "step": 6350 + }, + { + "epoch": 6.195811008280565, + "grad_norm": 1.4012749195098877, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6360 + }, + { + "epoch": 6.205552849488553, + "grad_norm": 1.759798288345337, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 6370 + }, + { + "epoch": 6.215294690696542, + "grad_norm": 1.468922734260559, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6380 + }, + { + "epoch": 6.2250365319045295, + "grad_norm": 1.3706471920013428, + "learning_rate": 0.0002, + "loss": 0.8356, + "step": 6390 + }, + { + "epoch": 6.234778373112518, + "grad_norm": 1.6397383213043213, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 6400 + }, + { + "epoch": 6.244520214320507, + "grad_norm": 1.5614187717437744, + "learning_rate": 0.0002, + "loss": 0.8834, + "step": 6410 + }, + { + "epoch": 6.2542620555284945, + "grad_norm": 1.7118678092956543, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 6420 + }, + { + "epoch": 6.264003896736483, + "grad_norm": 1.4041547775268555, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 6430 + }, + { + "epoch": 6.273745737944472, + "grad_norm": 1.7653605937957764, + "learning_rate": 0.0002, + "loss": 0.879, + "step": 6440 + }, + { + "epoch": 6.28348757915246, + "grad_norm": 2.6219191551208496, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 6450 + }, + { + "epoch": 6.293229420360448, + "grad_norm": 1.4757837057113647, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 6460 + }, + { + "epoch": 6.302971261568437, + "grad_norm": 1.715598225593567, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 6470 + }, + { + "epoch": 6.312713102776424, + "grad_norm": 1.376216173171997, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 6480 + }, + { + "epoch": 6.322454943984413, + "grad_norm": 1.7119828462600708, + "learning_rate": 0.0002, + "loss": 0.8742, + "step": 6490 + }, + { + "epoch": 6.3321967851924015, + "grad_norm": 1.4304355382919312, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 6500 + }, + { + "epoch": 6.34193862640039, + "grad_norm": 1.4889872074127197, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 6510 + }, + { + "epoch": 6.351680467608378, + "grad_norm": 1.370373010635376, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 6520 + }, + { + "epoch": 6.361422308816366, + "grad_norm": 1.7697709798812866, + "learning_rate": 0.0002, + "loss": 0.8997, + "step": 6530 + }, + { + "epoch": 6.371164150024355, + "grad_norm": 1.495297908782959, + "learning_rate": 0.0002, + "loss": 0.9421, + "step": 6540 + }, + { + "epoch": 6.380905991232343, + "grad_norm": 1.7251347303390503, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 6550 + }, + { + "epoch": 6.390647832440331, + "grad_norm": 1.6909505128860474, + "learning_rate": 0.0002, + "loss": 0.9327, + "step": 6560 + }, + { + "epoch": 6.40038967364832, + "grad_norm": 1.4369314908981323, + "learning_rate": 0.0002, + "loss": 0.837, + "step": 6570 + }, + { + "epoch": 6.410131514856308, + "grad_norm": 1.7803739309310913, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 6580 + }, + { + "epoch": 6.419873356064296, + "grad_norm": 1.6107097864151, + "learning_rate": 0.0002, + "loss": 0.9024, + "step": 6590 + }, + { + "epoch": 6.429615197272285, + "grad_norm": 1.6151643991470337, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 6600 + }, + { + "epoch": 6.4393570384802725, + "grad_norm": 1.7159833908081055, + "learning_rate": 0.0002, + "loss": 0.8791, + "step": 6610 + }, + { + "epoch": 6.449098879688261, + "grad_norm": 1.4366064071655273, + "learning_rate": 0.0002, + "loss": 0.9249, + "step": 6620 + }, + { + "epoch": 6.45884072089625, + "grad_norm": 1.6050453186035156, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 6630 + }, + { + "epoch": 6.468582562104237, + "grad_norm": 1.6296740770339966, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 6640 + }, + { + "epoch": 6.478324403312226, + "grad_norm": 1.6181174516677856, + "learning_rate": 0.0002, + "loss": 0.9228, + "step": 6650 + }, + { + "epoch": 6.488066244520215, + "grad_norm": 1.5452176332473755, + "learning_rate": 0.0002, + "loss": 0.9139, + "step": 6660 + }, + { + "epoch": 6.497808085728202, + "grad_norm": 1.3919731378555298, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 6670 + }, + { + "epoch": 6.507549926936191, + "grad_norm": 1.6456257104873657, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 6680 + }, + { + "epoch": 6.5172917681441795, + "grad_norm": 1.4147369861602783, + "learning_rate": 0.0002, + "loss": 0.9041, + "step": 6690 + }, + { + "epoch": 6.527033609352167, + "grad_norm": 1.7005025148391724, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 6700 + }, + { + "epoch": 6.536775450560156, + "grad_norm": 1.6032357215881348, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 6710 + }, + { + "epoch": 6.5465172917681445, + "grad_norm": 1.3454229831695557, + "learning_rate": 0.0002, + "loss": 0.9796, + "step": 6720 + }, + { + "epoch": 6.556259132976132, + "grad_norm": 1.6961418390274048, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 6730 + }, + { + "epoch": 6.566000974184121, + "grad_norm": 1.78407883644104, + "learning_rate": 0.0002, + "loss": 0.9241, + "step": 6740 + }, + { + "epoch": 6.575742815392109, + "grad_norm": 1.6817889213562012, + "learning_rate": 0.0002, + "loss": 0.8941, + "step": 6750 + }, + { + "epoch": 6.585484656600097, + "grad_norm": 1.7894943952560425, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 6760 + }, + { + "epoch": 6.595226497808086, + "grad_norm": 1.6404837369918823, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 6770 + }, + { + "epoch": 6.604968339016074, + "grad_norm": 1.5849255323410034, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 6780 + }, + { + "epoch": 6.614710180224062, + "grad_norm": 1.5993813276290894, + "learning_rate": 0.0002, + "loss": 0.9575, + "step": 6790 + }, + { + "epoch": 6.624452021432051, + "grad_norm": 1.2834863662719727, + "learning_rate": 0.0002, + "loss": 0.8922, + "step": 6800 + }, + { + "epoch": 6.634193862640039, + "grad_norm": 1.7215641736984253, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 6810 + }, + { + "epoch": 6.643935703848027, + "grad_norm": 1.7588146924972534, + "learning_rate": 0.0002, + "loss": 0.9292, + "step": 6820 + }, + { + "epoch": 6.6536775450560155, + "grad_norm": 1.7956023216247559, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 6830 + }, + { + "epoch": 6.663419386264004, + "grad_norm": 1.5115351676940918, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 6840 + }, + { + "epoch": 6.673161227471992, + "grad_norm": 1.5660319328308105, + "learning_rate": 0.0002, + "loss": 0.9329, + "step": 6850 + }, + { + "epoch": 6.68290306867998, + "grad_norm": 1.4323679208755493, + "learning_rate": 0.0002, + "loss": 0.9877, + "step": 6860 + }, + { + "epoch": 6.692644909887969, + "grad_norm": 1.662089467048645, + "learning_rate": 0.0002, + "loss": 0.8732, + "step": 6870 + }, + { + "epoch": 6.702386751095958, + "grad_norm": 1.7854869365692139, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6880 + }, + { + "epoch": 6.712128592303945, + "grad_norm": 1.5491222143173218, + "learning_rate": 0.0002, + "loss": 0.9105, + "step": 6890 + }, + { + "epoch": 6.721870433511934, + "grad_norm": 1.5946987867355347, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 6900 + }, + { + "epoch": 6.731612274719922, + "grad_norm": 1.6195964813232422, + "learning_rate": 0.0002, + "loss": 0.9391, + "step": 6910 + }, + { + "epoch": 6.74135411592791, + "grad_norm": 1.6366901397705078, + "learning_rate": 0.0002, + "loss": 0.8947, + "step": 6920 + }, + { + "epoch": 6.751095957135899, + "grad_norm": 1.5080382823944092, + "learning_rate": 0.0002, + "loss": 0.8695, + "step": 6930 + }, + { + "epoch": 6.760837798343887, + "grad_norm": 1.742353916168213, + "learning_rate": 0.0002, + "loss": 0.9124, + "step": 6940 + }, + { + "epoch": 6.770579639551875, + "grad_norm": 1.690251111984253, + "learning_rate": 0.0002, + "loss": 0.9118, + "step": 6950 + }, + { + "epoch": 6.780321480759864, + "grad_norm": 1.7103357315063477, + "learning_rate": 0.0002, + "loss": 0.9039, + "step": 6960 + }, + { + "epoch": 6.7900633219678515, + "grad_norm": 1.6630914211273193, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 6970 + }, + { + "epoch": 6.79980516317584, + "grad_norm": 1.423768162727356, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 6980 + }, + { + "epoch": 6.809547004383829, + "grad_norm": 1.7844693660736084, + "learning_rate": 0.0002, + "loss": 0.9397, + "step": 6990 + }, + { + "epoch": 6.819288845591817, + "grad_norm": 1.545282006263733, + "learning_rate": 0.0002, + "loss": 0.8889, + "step": 7000 + }, + { + "epoch": 6.829030686799805, + "grad_norm": 1.4340319633483887, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 7010 + }, + { + "epoch": 6.838772528007794, + "grad_norm": 1.5981626510620117, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 7020 + }, + { + "epoch": 6.848514369215782, + "grad_norm": 1.5205026865005493, + "learning_rate": 0.0002, + "loss": 0.9062, + "step": 7030 + }, + { + "epoch": 6.85825621042377, + "grad_norm": 1.6999989748001099, + "learning_rate": 0.0002, + "loss": 0.9245, + "step": 7040 + }, + { + "epoch": 6.8679980516317585, + "grad_norm": 1.6392347812652588, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 7050 + }, + { + "epoch": 6.877739892839747, + "grad_norm": 1.637308955192566, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 7060 + }, + { + "epoch": 6.887481734047735, + "grad_norm": 1.671341896057129, + "learning_rate": 0.0002, + "loss": 0.9672, + "step": 7070 + }, + { + "epoch": 6.897223575255723, + "grad_norm": 1.4437555074691772, + "learning_rate": 0.0002, + "loss": 0.9726, + "step": 7080 + }, + { + "epoch": 6.906965416463712, + "grad_norm": 1.4251935482025146, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 7090 + }, + { + "epoch": 6.9167072576717, + "grad_norm": 1.5106734037399292, + "learning_rate": 0.0002, + "loss": 0.8858, + "step": 7100 + }, + { + "epoch": 6.926449098879688, + "grad_norm": 1.670742154121399, + "learning_rate": 0.0002, + "loss": 0.939, + "step": 7110 + }, + { + "epoch": 6.936190940087677, + "grad_norm": 1.4353723526000977, + "learning_rate": 0.0002, + "loss": 0.8818, + "step": 7120 + }, + { + "epoch": 6.945932781295665, + "grad_norm": 1.9437772035598755, + "learning_rate": 0.0002, + "loss": 0.9354, + "step": 7130 + }, + { + "epoch": 6.955674622503653, + "grad_norm": 1.4922038316726685, + "learning_rate": 0.0002, + "loss": 0.9623, + "step": 7140 + }, + { + "epoch": 6.965416463711642, + "grad_norm": 1.489193081855774, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 7150 + }, + { + "epoch": 6.9751583049196295, + "grad_norm": 1.529490351676941, + "learning_rate": 0.0002, + "loss": 1.0024, + "step": 7160 + }, + { + "epoch": 6.984900146127618, + "grad_norm": 1.7370105981826782, + "learning_rate": 0.0002, + "loss": 0.9715, + "step": 7170 + }, + { + "epoch": 6.994641987335607, + "grad_norm": 1.5639604330062866, + "learning_rate": 0.0002, + "loss": 0.921, + "step": 7180 + }, + { + "epoch": 6.9995129079396, + "eval_loss": 2.521758794784546, + "eval_runtime": 56.1587, + "eval_samples_per_second": 9.028, + "eval_steps_per_second": 1.14, + "step": 7185 + } + ], + "logging_steps": 10, + "max_steps": 8208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.690741942111437e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-7185/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5433538c92f49a9f359ba0c5182ab149dcf1a196 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e2ff42e571db69511d0848cefcaf9b1f9e59bf1c3e3ff6a29e447e500c46c59 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e8de090c68300eab35c5d8fcfd2ddc0f40b2822 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd0e8db97eb715d41d3018b5c2c2d671cb3796d3b95a0cbb6c266c4c8af5439b +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7dd8bf7456fad9edd68113143d4a1cd8d1cc947e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7432f83a4c6152fd92ed8ca152d25a2c7c587d2430fe61817434dc641419d9 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..82d33307f21179c52d408ea20eeb125b85b01b25 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a123ed91b99863f6a4bfffa749f6e9055a4cad42fec0e03d6a304d1cc972c68f +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..99932528bdebbe9c06f4d5ebbaa98b1ec80f8833 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/trainer_state.json @@ -0,0 +1,5837 @@ +{ + "best_metric": 1.8103164434432983, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", + "epoch": 7.996103263516805, + "eval_steps": 10, + "global_step": 8208, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00974184120798831, + "grad_norm": 0.6537588834762573, + "learning_rate": 0.0002, + "loss": 3.0782, + "step": 10 + }, + { + "epoch": 0.01948368241597662, + "grad_norm": 0.5270306468009949, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 20 + }, + { + "epoch": 0.029225523623964928, + "grad_norm": 0.6826501488685608, + "learning_rate": 0.0002, + "loss": 2.2589, + "step": 30 + }, + { + "epoch": 0.03896736483195324, + "grad_norm": 0.5061377286911011, + "learning_rate": 0.0002, + "loss": 2.0141, + "step": 40 + }, + { + "epoch": 0.04870920603994155, + "grad_norm": 0.4300410747528076, + "learning_rate": 0.0002, + "loss": 1.9458, + "step": 50 + }, + { + "epoch": 0.058451047247929856, + "grad_norm": 0.5063319802284241, + "learning_rate": 0.0002, + "loss": 1.983, + "step": 60 + }, + { + "epoch": 0.06819288845591817, + "grad_norm": 0.49310117959976196, + "learning_rate": 0.0002, + "loss": 1.9799, + "step": 70 + }, + { + "epoch": 0.07793472966390648, + "grad_norm": 0.4676004648208618, + "learning_rate": 0.0002, + "loss": 1.9277, + "step": 80 + }, + { + "epoch": 0.08767657087189479, + "grad_norm": 0.41647228598594666, + "learning_rate": 0.0002, + "loss": 1.9147, + "step": 90 + }, + { + "epoch": 0.0974184120798831, + "grad_norm": 0.40217313170433044, + "learning_rate": 0.0002, + "loss": 1.8894, + "step": 100 + }, + { + "epoch": 0.1071602532878714, + "grad_norm": 0.4123637080192566, + "learning_rate": 0.0002, + "loss": 1.9099, + "step": 110 + }, + { + "epoch": 0.11690209449585971, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.0002, + "loss": 1.8471, + "step": 120 + }, + { + "epoch": 0.12664393570384802, + "grad_norm": 0.3179326355457306, + "learning_rate": 0.0002, + "loss": 1.894, + "step": 130 + }, + { + "epoch": 0.13638577691183634, + "grad_norm": 0.3548192083835602, + "learning_rate": 0.0002, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.14612761811982464, + "grad_norm": 0.3273540139198303, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 150 + }, + { + "epoch": 0.15586945932781296, + "grad_norm": 0.36500975489616394, + "learning_rate": 0.0002, + "loss": 1.8496, + "step": 160 + }, + { + "epoch": 0.16561130053580125, + "grad_norm": 0.4106619656085968, + "learning_rate": 0.0002, + "loss": 1.8473, + "step": 170 + }, + { + "epoch": 0.17535314174378958, + "grad_norm": 0.41361644864082336, + "learning_rate": 0.0002, + "loss": 1.841, + "step": 180 + }, + { + "epoch": 0.1850949829517779, + "grad_norm": 0.3608580231666565, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 190 + }, + { + "epoch": 0.1948368241597662, + "grad_norm": 0.4291760325431824, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 200 + }, + { + "epoch": 0.20457866536775451, + "grad_norm": 0.344184011220932, + "learning_rate": 0.0002, + "loss": 1.8437, + "step": 210 + }, + { + "epoch": 0.2143205065757428, + "grad_norm": 0.3834705650806427, + "learning_rate": 0.0002, + "loss": 1.8779, + "step": 220 + }, + { + "epoch": 0.22406234778373113, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 230 + }, + { + "epoch": 0.23380418899171942, + "grad_norm": 0.4306780695915222, + "learning_rate": 0.0002, + "loss": 1.824, + "step": 240 + }, + { + "epoch": 0.24354603019970775, + "grad_norm": 0.5066465139389038, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 250 + }, + { + "epoch": 0.25328787140769604, + "grad_norm": 0.34227681159973145, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 260 + }, + { + "epoch": 0.26302971261568436, + "grad_norm": 0.3346865475177765, + "learning_rate": 0.0002, + "loss": 1.8614, + "step": 270 + }, + { + "epoch": 0.2727715538236727, + "grad_norm": 0.3639362156391144, + "learning_rate": 0.0002, + "loss": 1.8502, + "step": 280 + }, + { + "epoch": 0.282513395031661, + "grad_norm": 0.33223700523376465, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 290 + }, + { + "epoch": 0.2922552362396493, + "grad_norm": 0.35176315903663635, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 300 + }, + { + "epoch": 0.3019970774476376, + "grad_norm": 0.3581472635269165, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 310 + }, + { + "epoch": 0.3117389186556259, + "grad_norm": 0.35943421721458435, + "learning_rate": 0.0002, + "loss": 1.8262, + "step": 320 + }, + { + "epoch": 0.32148075986361424, + "grad_norm": 0.322051078081131, + "learning_rate": 0.0002, + "loss": 1.8167, + "step": 330 + }, + { + "epoch": 0.3312226010716025, + "grad_norm": 0.33904823660850525, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 340 + }, + { + "epoch": 0.34096444227959083, + "grad_norm": 0.39162731170654297, + "learning_rate": 0.0002, + "loss": 1.9159, + "step": 350 + }, + { + "epoch": 0.35070628348757915, + "grad_norm": 0.330624520778656, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 360 + }, + { + "epoch": 0.3604481246955675, + "grad_norm": 0.3793248236179352, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 370 + }, + { + "epoch": 0.3701899659035558, + "grad_norm": 0.3347395658493042, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 380 + }, + { + "epoch": 0.37993180711154406, + "grad_norm": 0.30527254939079285, + "learning_rate": 0.0002, + "loss": 1.9244, + "step": 390 + }, + { + "epoch": 0.3896736483195324, + "grad_norm": 0.3081390857696533, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 400 + }, + { + "epoch": 0.3994154895275207, + "grad_norm": 0.3742620050907135, + "learning_rate": 0.0002, + "loss": 1.8968, + "step": 410 + }, + { + "epoch": 0.40915733073550903, + "grad_norm": 0.4080568253993988, + "learning_rate": 0.0002, + "loss": 1.8095, + "step": 420 + }, + { + "epoch": 0.4188991719434973, + "grad_norm": 0.38034746050834656, + "learning_rate": 0.0002, + "loss": 1.8555, + "step": 430 + }, + { + "epoch": 0.4286410131514856, + "grad_norm": 0.34893402457237244, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 440 + }, + { + "epoch": 0.43838285435947394, + "grad_norm": 0.33285608887672424, + "learning_rate": 0.0002, + "loss": 1.8481, + "step": 450 + }, + { + "epoch": 0.44812469556746226, + "grad_norm": 0.4110095798969269, + "learning_rate": 0.0002, + "loss": 1.8466, + "step": 460 + }, + { + "epoch": 0.4578665367754506, + "grad_norm": 0.3658817410469055, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 470 + }, + { + "epoch": 0.46760837798343885, + "grad_norm": 0.31350770592689514, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 480 + }, + { + "epoch": 0.47735021919142717, + "grad_norm": 0.38827991485595703, + "learning_rate": 0.0002, + "loss": 1.7839, + "step": 490 + }, + { + "epoch": 0.4870920603994155, + "grad_norm": 0.3792393207550049, + "learning_rate": 0.0002, + "loss": 1.8224, + "step": 500 + }, + { + "epoch": 0.4968339016074038, + "grad_norm": 0.3004095256328583, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 510 + }, + { + "epoch": 0.5065757428153921, + "grad_norm": 0.3200063407421112, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 520 + }, + { + "epoch": 0.5163175840233805, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 530 + }, + { + "epoch": 0.5260594252313687, + "grad_norm": 0.30258631706237793, + "learning_rate": 0.0002, + "loss": 1.7725, + "step": 540 + }, + { + "epoch": 0.535801266439357, + "grad_norm": 0.28210392594337463, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 550 + }, + { + "epoch": 0.5455431076473454, + "grad_norm": 0.34854066371917725, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 560 + }, + { + "epoch": 0.5552849488553336, + "grad_norm": 0.31689873337745667, + "learning_rate": 0.0002, + "loss": 1.8331, + "step": 570 + }, + { + "epoch": 0.565026790063322, + "grad_norm": 0.31253790855407715, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 580 + }, + { + "epoch": 0.5747686312713103, + "grad_norm": 0.3229721188545227, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 590 + }, + { + "epoch": 0.5845104724792985, + "grad_norm": 0.3723772466182709, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 600 + }, + { + "epoch": 0.5942523136872869, + "grad_norm": 0.345798522233963, + "learning_rate": 0.0002, + "loss": 1.8357, + "step": 610 + }, + { + "epoch": 0.6039941548952752, + "grad_norm": 0.3440598249435425, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 620 + }, + { + "epoch": 0.6137359961032636, + "grad_norm": 0.3406416177749634, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 630 + }, + { + "epoch": 0.6234778373112518, + "grad_norm": 0.3218357264995575, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 640 + }, + { + "epoch": 0.6332196785192401, + "grad_norm": 0.45319172739982605, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 650 + }, + { + "epoch": 0.6429615197272285, + "grad_norm": 0.2787110507488251, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 660 + }, + { + "epoch": 0.6527033609352167, + "grad_norm": 0.3064707815647125, + "learning_rate": 0.0002, + "loss": 1.8426, + "step": 670 + }, + { + "epoch": 0.662445202143205, + "grad_norm": 0.2940629720687866, + "learning_rate": 0.0002, + "loss": 1.846, + "step": 680 + }, + { + "epoch": 0.6721870433511934, + "grad_norm": 0.31695225834846497, + "learning_rate": 0.0002, + "loss": 1.7865, + "step": 690 + }, + { + "epoch": 0.6819288845591817, + "grad_norm": 0.29589611291885376, + "learning_rate": 0.0002, + "loss": 1.8449, + "step": 700 + }, + { + "epoch": 0.69167072576717, + "grad_norm": 0.3062121570110321, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 710 + }, + { + "epoch": 0.7014125669751583, + "grad_norm": 0.3315656781196594, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 720 + }, + { + "epoch": 0.7111544081831466, + "grad_norm": 0.30353930592536926, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 730 + }, + { + "epoch": 0.720896249391135, + "grad_norm": 0.28360483050346375, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 740 + }, + { + "epoch": 0.7306380905991232, + "grad_norm": 0.3362562656402588, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 750 + }, + { + "epoch": 0.7403799318071116, + "grad_norm": 0.40434667468070984, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 760 + }, + { + "epoch": 0.7501217730150999, + "grad_norm": 0.2930425703525543, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 770 + }, + { + "epoch": 0.7598636142230881, + "grad_norm": 0.30177003145217896, + "learning_rate": 0.0002, + "loss": 1.8216, + "step": 780 + }, + { + "epoch": 0.7696054554310765, + "grad_norm": 0.2784474790096283, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 790 + }, + { + "epoch": 0.7793472966390648, + "grad_norm": 0.35849854350090027, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 800 + }, + { + "epoch": 0.7890891378470531, + "grad_norm": 0.27329114079475403, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 810 + }, + { + "epoch": 0.7988309790550414, + "grad_norm": 0.33331671357154846, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 820 + }, + { + "epoch": 0.8085728202630297, + "grad_norm": 0.28727295994758606, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 830 + }, + { + "epoch": 0.8183146614710181, + "grad_norm": 0.31391268968582153, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 840 + }, + { + "epoch": 0.8280565026790063, + "grad_norm": 0.3303709030151367, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 850 + }, + { + "epoch": 0.8377983438869946, + "grad_norm": 0.33772537112236023, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 860 + }, + { + "epoch": 0.847540185094983, + "grad_norm": 0.32876333594322205, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 870 + }, + { + "epoch": 0.8572820263029712, + "grad_norm": 0.28444716334342957, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 880 + }, + { + "epoch": 0.8670238675109596, + "grad_norm": 0.3070019483566284, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 890 + }, + { + "epoch": 0.8767657087189479, + "grad_norm": 0.29484760761260986, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 900 + }, + { + "epoch": 0.8865075499269361, + "grad_norm": 0.32373034954071045, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 910 + }, + { + "epoch": 0.8962493911349245, + "grad_norm": 0.3229396939277649, + "learning_rate": 0.0002, + "loss": 1.7799, + "step": 920 + }, + { + "epoch": 0.9059912323429128, + "grad_norm": 0.33151453733444214, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 930 + }, + { + "epoch": 0.9157330735509012, + "grad_norm": 0.32037460803985596, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 940 + }, + { + "epoch": 0.9254749147588894, + "grad_norm": 0.31283533573150635, + "learning_rate": 0.0002, + "loss": 1.822, + "step": 950 + }, + { + "epoch": 0.9352167559668777, + "grad_norm": 0.27984118461608887, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 960 + }, + { + "epoch": 0.9449585971748661, + "grad_norm": 0.316500186920166, + "learning_rate": 0.0002, + "loss": 1.7755, + "step": 970 + }, + { + "epoch": 0.9547004383828543, + "grad_norm": 0.33708682656288147, + "learning_rate": 0.0002, + "loss": 1.8032, + "step": 980 + }, + { + "epoch": 0.9644422795908427, + "grad_norm": 0.31026017665863037, + "learning_rate": 0.0002, + "loss": 1.8863, + "step": 990 + }, + { + "epoch": 0.974184120798831, + "grad_norm": 0.30874672532081604, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 1000 + }, + { + "epoch": 0.9839259620068193, + "grad_norm": 0.3257741630077362, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1010 + }, + { + "epoch": 0.9936678032148076, + "grad_norm": 0.2865653932094574, + "learning_rate": 0.0002, + "loss": 1.7936, + "step": 1020 + }, + { + "epoch": 0.9995129079396006, + "eval_loss": 1.8103164434432983, + "eval_runtime": 56.3917, + "eval_samples_per_second": 8.991, + "eval_steps_per_second": 1.135, + "step": 1026 + }, + { + "epoch": 1.003409644422796, + "grad_norm": 0.2860608398914337, + "learning_rate": 0.0002, + "loss": 1.7013, + "step": 1030 + }, + { + "epoch": 1.0131514856307842, + "grad_norm": 0.3156210780143738, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 1040 + }, + { + "epoch": 1.0228933268387725, + "grad_norm": 0.26126575469970703, + "learning_rate": 0.0002, + "loss": 1.6182, + "step": 1050 + }, + { + "epoch": 1.032635168046761, + "grad_norm": 0.3019633889198303, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 1060 + }, + { + "epoch": 1.042377009254749, + "grad_norm": 0.534140944480896, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1070 + }, + { + "epoch": 1.0521188504627375, + "grad_norm": 0.311872661113739, + "learning_rate": 0.0002, + "loss": 1.664, + "step": 1080 + }, + { + "epoch": 1.0618606916707258, + "grad_norm": 0.3276001513004303, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 1090 + }, + { + "epoch": 1.071602532878714, + "grad_norm": 0.35227468609809875, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1100 + }, + { + "epoch": 1.0813443740867024, + "grad_norm": 0.3597564995288849, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 1110 + }, + { + "epoch": 1.0910862152946907, + "grad_norm": 0.3547225296497345, + "learning_rate": 0.0002, + "loss": 1.7635, + "step": 1120 + }, + { + "epoch": 1.100828056502679, + "grad_norm": 0.3399786353111267, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1130 + }, + { + "epoch": 1.1105698977106673, + "grad_norm": 0.3309086263179779, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 1140 + }, + { + "epoch": 1.1203117389186557, + "grad_norm": 0.39330706000328064, + "learning_rate": 0.0002, + "loss": 1.7372, + "step": 1150 + }, + { + "epoch": 1.130053580126644, + "grad_norm": 0.3628021776676178, + "learning_rate": 0.0002, + "loss": 1.6236, + "step": 1160 + }, + { + "epoch": 1.1397954213346322, + "grad_norm": 0.32995012402534485, + "learning_rate": 0.0002, + "loss": 1.8022, + "step": 1170 + }, + { + "epoch": 1.1495372625426206, + "grad_norm": 0.36292821168899536, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 1180 + }, + { + "epoch": 1.159279103750609, + "grad_norm": 0.3470092713832855, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 1190 + }, + { + "epoch": 1.169020944958597, + "grad_norm": 0.3496156334877014, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1200 + }, + { + "epoch": 1.1787627861665855, + "grad_norm": 0.3442084789276123, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1210 + }, + { + "epoch": 1.1885046273745739, + "grad_norm": 0.34983909130096436, + "learning_rate": 0.0002, + "loss": 1.7763, + "step": 1220 + }, + { + "epoch": 1.198246468582562, + "grad_norm": 0.36505937576293945, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1230 + }, + { + "epoch": 1.2079883097905504, + "grad_norm": 0.31624770164489746, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1240 + }, + { + "epoch": 1.2177301509985388, + "grad_norm": 0.3528020679950714, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1250 + }, + { + "epoch": 1.2274719922065271, + "grad_norm": 0.29294025897979736, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1260 + }, + { + "epoch": 1.2372138334145153, + "grad_norm": 0.35048434138298035, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1270 + }, + { + "epoch": 1.2469556746225037, + "grad_norm": 0.35224461555480957, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1280 + }, + { + "epoch": 1.256697515830492, + "grad_norm": 0.4041554629802704, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1290 + }, + { + "epoch": 1.2664393570384802, + "grad_norm": 0.3447791039943695, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1300 + }, + { + "epoch": 1.2761811982464686, + "grad_norm": 0.3315333425998688, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1310 + }, + { + "epoch": 1.285923039454457, + "grad_norm": 0.3587741255760193, + "learning_rate": 0.0002, + "loss": 1.7325, + "step": 1320 + }, + { + "epoch": 1.2956648806624451, + "grad_norm": 0.3704394996166229, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1330 + }, + { + "epoch": 1.3054067218704335, + "grad_norm": 0.38131803274154663, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1340 + }, + { + "epoch": 1.3151485630784219, + "grad_norm": 0.36109617352485657, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 1350 + }, + { + "epoch": 1.32489040428641, + "grad_norm": 0.37283554673194885, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 1360 + }, + { + "epoch": 1.3346322454943984, + "grad_norm": 0.31808891892433167, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1370 + }, + { + "epoch": 1.3443740867023868, + "grad_norm": 0.3370385766029358, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1380 + }, + { + "epoch": 1.354115927910375, + "grad_norm": 0.3568558394908905, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1390 + }, + { + "epoch": 1.3638577691183633, + "grad_norm": 0.3537410497665405, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1400 + }, + { + "epoch": 1.3735996103263517, + "grad_norm": 0.3536544144153595, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 1410 + }, + { + "epoch": 1.38334145153434, + "grad_norm": 0.3772895038127899, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 1420 + }, + { + "epoch": 1.3930832927423282, + "grad_norm": 0.38079720735549927, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1430 + }, + { + "epoch": 1.4028251339503166, + "grad_norm": 0.3811109662055969, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1440 + }, + { + "epoch": 1.412566975158305, + "grad_norm": 0.38586318492889404, + "learning_rate": 0.0002, + "loss": 1.6424, + "step": 1450 + }, + { + "epoch": 1.4223088163662934, + "grad_norm": 0.3405744135379791, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1460 + }, + { + "epoch": 1.4320506575742815, + "grad_norm": 0.39527642726898193, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 1470 + }, + { + "epoch": 1.44179249878227, + "grad_norm": 0.4494728744029999, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 1480 + }, + { + "epoch": 1.4515343399902583, + "grad_norm": 0.34068453311920166, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 1490 + }, + { + "epoch": 1.4612761811982464, + "grad_norm": 0.36169710755348206, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 1500 + }, + { + "epoch": 1.4710180224062348, + "grad_norm": 0.31519418954849243, + "learning_rate": 0.0002, + "loss": 1.7189, + "step": 1510 + }, + { + "epoch": 1.4807598636142232, + "grad_norm": 0.35117292404174805, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1520 + }, + { + "epoch": 1.4905017048222113, + "grad_norm": 0.40951141715049744, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 1530 + }, + { + "epoch": 1.5002435460301997, + "grad_norm": 0.37542906403541565, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 1540 + }, + { + "epoch": 1.509985387238188, + "grad_norm": 0.35395753383636475, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1550 + }, + { + "epoch": 1.5197272284461762, + "grad_norm": 0.35497018694877625, + "learning_rate": 0.0002, + "loss": 1.6517, + "step": 1560 + }, + { + "epoch": 1.5294690696541646, + "grad_norm": 0.3693031072616577, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 1570 + }, + { + "epoch": 1.539210910862153, + "grad_norm": 0.34013301134109497, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 1580 + }, + { + "epoch": 1.5489527520701412, + "grad_norm": 0.37312784790992737, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1590 + }, + { + "epoch": 1.5586945932781295, + "grad_norm": 0.357496440410614, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1600 + }, + { + "epoch": 1.568436434486118, + "grad_norm": 0.35192370414733887, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1610 + }, + { + "epoch": 1.578178275694106, + "grad_norm": 0.34144821763038635, + "learning_rate": 0.0002, + "loss": 1.7505, + "step": 1620 + }, + { + "epoch": 1.5879201169020944, + "grad_norm": 0.3320509195327759, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 1630 + }, + { + "epoch": 1.5976619581100828, + "grad_norm": 0.34178847074508667, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1640 + }, + { + "epoch": 1.607403799318071, + "grad_norm": 0.36567580699920654, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 1650 + }, + { + "epoch": 1.6171456405260596, + "grad_norm": 0.35599812865257263, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1660 + }, + { + "epoch": 1.6268874817340477, + "grad_norm": 0.33765384554862976, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1670 + }, + { + "epoch": 1.636629322942036, + "grad_norm": 0.33142679929733276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 1680 + }, + { + "epoch": 1.6463711641500245, + "grad_norm": 0.6959079504013062, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1690 + }, + { + "epoch": 1.6561130053580126, + "grad_norm": 0.35073819756507874, + "learning_rate": 0.0002, + "loss": 1.7665, + "step": 1700 + }, + { + "epoch": 1.665854846566001, + "grad_norm": 0.3461478352546692, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1710 + }, + { + "epoch": 1.6755966877739894, + "grad_norm": 0.3697752058506012, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 1720 + }, + { + "epoch": 1.6853385289819776, + "grad_norm": 0.3755154609680176, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1730 + }, + { + "epoch": 1.695080370189966, + "grad_norm": 0.33977627754211426, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 1740 + }, + { + "epoch": 1.7048222113979543, + "grad_norm": 0.4001041650772095, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1750 + }, + { + "epoch": 1.7145640526059425, + "grad_norm": 0.36998286843299866, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 1760 + }, + { + "epoch": 1.7243058938139308, + "grad_norm": 0.39944565296173096, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1770 + }, + { + "epoch": 1.7340477350219192, + "grad_norm": 0.4002859890460968, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1780 + }, + { + "epoch": 1.7437895762299074, + "grad_norm": 0.33336859941482544, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 1790 + }, + { + "epoch": 1.7535314174378958, + "grad_norm": 0.35853952169418335, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 1800 + }, + { + "epoch": 1.7632732586458841, + "grad_norm": 0.35876700282096863, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1810 + }, + { + "epoch": 1.7730150998538723, + "grad_norm": 0.3497968912124634, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 1820 + }, + { + "epoch": 1.7827569410618607, + "grad_norm": 0.33182016015052795, + "learning_rate": 0.0002, + "loss": 1.7128, + "step": 1830 + }, + { + "epoch": 1.792498782269849, + "grad_norm": 0.33359771966934204, + "learning_rate": 0.0002, + "loss": 1.7594, + "step": 1840 + }, + { + "epoch": 1.8022406234778372, + "grad_norm": 0.38070961833000183, + "learning_rate": 0.0002, + "loss": 1.8611, + "step": 1850 + }, + { + "epoch": 1.8119824646858256, + "grad_norm": 0.34111160039901733, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 1860 + }, + { + "epoch": 1.821724305893814, + "grad_norm": 0.4439302980899811, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 1870 + }, + { + "epoch": 1.8314661471018021, + "grad_norm": 0.37065210938453674, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 1880 + }, + { + "epoch": 1.8412079883097907, + "grad_norm": 0.33630406856536865, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 1890 + }, + { + "epoch": 1.8509498295177789, + "grad_norm": 0.334553986787796, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 1900 + }, + { + "epoch": 1.860691670725767, + "grad_norm": 0.3603808879852295, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1910 + }, + { + "epoch": 1.8704335119337556, + "grad_norm": 0.4307343363761902, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 1920 + }, + { + "epoch": 1.8801753531417438, + "grad_norm": 0.455602765083313, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1930 + }, + { + "epoch": 1.8899171943497322, + "grad_norm": 0.35242316126823425, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 1940 + }, + { + "epoch": 1.8996590355577205, + "grad_norm": 0.3589116632938385, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 1950 + }, + { + "epoch": 1.9094008767657087, + "grad_norm": 0.3540741801261902, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1960 + }, + { + "epoch": 1.919142717973697, + "grad_norm": 0.3547612428665161, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 1970 + }, + { + "epoch": 1.9288845591816854, + "grad_norm": 0.3485773503780365, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1980 + }, + { + "epoch": 1.9386264003896736, + "grad_norm": 0.3560304641723633, + "learning_rate": 0.0002, + "loss": 1.7301, + "step": 1990 + }, + { + "epoch": 1.948368241597662, + "grad_norm": 0.33299335837364197, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 2000 + }, + { + "epoch": 1.9581100828056504, + "grad_norm": 0.35622233152389526, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2010 + }, + { + "epoch": 1.9678519240136385, + "grad_norm": 0.3681301474571228, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 2020 + }, + { + "epoch": 1.9775937652216269, + "grad_norm": 0.36158084869384766, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2030 + }, + { + "epoch": 1.9873356064296153, + "grad_norm": 0.32560569047927856, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 2040 + }, + { + "epoch": 1.9970774476376034, + "grad_norm": 0.37404149770736694, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 2050 + }, + { + "epoch": 2.0, + "eval_loss": 1.8119343519210815, + "eval_runtime": 96.0045, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 0.667, + "step": 2053 + }, + { + "epoch": 2.006819288845592, + "grad_norm": 0.374188631772995, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 2060 + }, + { + "epoch": 2.01656113005358, + "grad_norm": 0.421764075756073, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2070 + }, + { + "epoch": 2.0263029712615683, + "grad_norm": 0.43841829895973206, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2080 + }, + { + "epoch": 2.036044812469557, + "grad_norm": 0.42298218607902527, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 2090 + }, + { + "epoch": 2.045786653677545, + "grad_norm": 0.43669602274894714, + "learning_rate": 0.0002, + "loss": 1.5883, + "step": 2100 + }, + { + "epoch": 2.0555284948855332, + "grad_norm": 0.4080469012260437, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 2110 + }, + { + "epoch": 2.065270336093522, + "grad_norm": 0.483192503452301, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2120 + }, + { + "epoch": 2.07501217730151, + "grad_norm": 0.44427400827407837, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 2130 + }, + { + "epoch": 2.084754018509498, + "grad_norm": 0.48835131525993347, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 2140 + }, + { + "epoch": 2.0944958597174868, + "grad_norm": 0.42733684182167053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 2150 + }, + { + "epoch": 2.104237700925475, + "grad_norm": 0.4258694648742676, + "learning_rate": 0.0002, + "loss": 1.5562, + "step": 2160 + }, + { + "epoch": 2.113979542133463, + "grad_norm": 0.5164985656738281, + "learning_rate": 0.0002, + "loss": 1.517, + "step": 2170 + }, + { + "epoch": 2.1237213833414517, + "grad_norm": 0.4279228150844574, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 2180 + }, + { + "epoch": 2.13346322454944, + "grad_norm": 0.48209506273269653, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2190 + }, + { + "epoch": 2.143205065757428, + "grad_norm": 0.4071785509586334, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 2200 + }, + { + "epoch": 2.1529469069654166, + "grad_norm": 0.4629398584365845, + "learning_rate": 0.0002, + "loss": 1.5469, + "step": 2210 + }, + { + "epoch": 2.1626887481734047, + "grad_norm": 0.44390997290611267, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2220 + }, + { + "epoch": 2.172430589381393, + "grad_norm": 0.46886971592903137, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 2230 + }, + { + "epoch": 2.1821724305893815, + "grad_norm": 0.43745434284210205, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 2240 + }, + { + "epoch": 2.1919142717973696, + "grad_norm": 0.42737245559692383, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 2250 + }, + { + "epoch": 2.201656113005358, + "grad_norm": 0.5028428435325623, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 2260 + }, + { + "epoch": 2.2113979542133464, + "grad_norm": 0.48987212777137756, + "learning_rate": 0.0002, + "loss": 1.6148, + "step": 2270 + }, + { + "epoch": 2.2211397954213346, + "grad_norm": 0.48186370730400085, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 2280 + }, + { + "epoch": 2.2308816366293227, + "grad_norm": 0.4417429566383362, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 2290 + }, + { + "epoch": 2.2406234778373113, + "grad_norm": 0.4757710099220276, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2300 + }, + { + "epoch": 2.2503653190452995, + "grad_norm": 0.44449448585510254, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2310 + }, + { + "epoch": 2.260107160253288, + "grad_norm": 0.5070863962173462, + "learning_rate": 0.0002, + "loss": 1.5742, + "step": 2320 + }, + { + "epoch": 2.269849001461276, + "grad_norm": 0.4967133700847626, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2330 + }, + { + "epoch": 2.2795908426692644, + "grad_norm": 0.5110220909118652, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2340 + }, + { + "epoch": 2.289332683877253, + "grad_norm": 0.47984135150909424, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2350 + }, + { + "epoch": 2.299074525085241, + "grad_norm": 0.5005794763565063, + "learning_rate": 0.0002, + "loss": 1.5927, + "step": 2360 + }, + { + "epoch": 2.3088163662932293, + "grad_norm": 0.4991425573825836, + "learning_rate": 0.0002, + "loss": 1.6131, + "step": 2370 + }, + { + "epoch": 2.318558207501218, + "grad_norm": 0.4948616623878479, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2380 + }, + { + "epoch": 2.328300048709206, + "grad_norm": 0.4533160328865051, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 2390 + }, + { + "epoch": 2.338041889917194, + "grad_norm": 0.5871071219444275, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 2400 + }, + { + "epoch": 2.347783731125183, + "grad_norm": 0.5048075914382935, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 2410 + }, + { + "epoch": 2.357525572333171, + "grad_norm": 0.4973750412464142, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 2420 + }, + { + "epoch": 2.367267413541159, + "grad_norm": 0.48294538259506226, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 2430 + }, + { + "epoch": 2.3770092547491477, + "grad_norm": 0.7180454134941101, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 2440 + }, + { + "epoch": 2.386751095957136, + "grad_norm": 0.4627632796764374, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 2450 + }, + { + "epoch": 2.396492937165124, + "grad_norm": 0.4834378957748413, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 2460 + }, + { + "epoch": 2.4062347783731126, + "grad_norm": 0.5173670649528503, + "learning_rate": 0.0002, + "loss": 1.6145, + "step": 2470 + }, + { + "epoch": 2.4159766195811008, + "grad_norm": 0.49652737379074097, + "learning_rate": 0.0002, + "loss": 1.5464, + "step": 2480 + }, + { + "epoch": 2.4257184607890894, + "grad_norm": 0.47052669525146484, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 2490 + }, + { + "epoch": 2.4354603019970775, + "grad_norm": 0.5188006162643433, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 2500 + }, + { + "epoch": 2.4452021432050657, + "grad_norm": 0.5010119676589966, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 2510 + }, + { + "epoch": 2.4549439844130543, + "grad_norm": 0.4765235483646393, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2520 + }, + { + "epoch": 2.4646858256210424, + "grad_norm": 0.5292699337005615, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2530 + }, + { + "epoch": 2.4744276668290306, + "grad_norm": 0.48555099964141846, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 2540 + }, + { + "epoch": 2.484169508037019, + "grad_norm": 0.4764043092727661, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 2550 + }, + { + "epoch": 2.4939113492450073, + "grad_norm": 0.47839659452438354, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 2560 + }, + { + "epoch": 2.5036531904529955, + "grad_norm": 0.4514436721801758, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 2570 + }, + { + "epoch": 2.513395031660984, + "grad_norm": 0.5681955218315125, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2580 + }, + { + "epoch": 2.5231368728689723, + "grad_norm": 0.49655985832214355, + "learning_rate": 0.0002, + "loss": 1.5976, + "step": 2590 + }, + { + "epoch": 2.5328787140769604, + "grad_norm": 0.5077657103538513, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2600 + }, + { + "epoch": 2.542620555284949, + "grad_norm": 0.5643279552459717, + "learning_rate": 0.0002, + "loss": 1.5658, + "step": 2610 + }, + { + "epoch": 2.552362396492937, + "grad_norm": 0.4715031087398529, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 2620 + }, + { + "epoch": 2.5621042377009253, + "grad_norm": 0.528400719165802, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2630 + }, + { + "epoch": 2.571846078908914, + "grad_norm": 0.49469611048698425, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2640 + }, + { + "epoch": 2.581587920116902, + "grad_norm": 0.4567806124687195, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 2650 + }, + { + "epoch": 2.5913297613248902, + "grad_norm": 0.5357107520103455, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 2660 + }, + { + "epoch": 2.601071602532879, + "grad_norm": 0.46977677941322327, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 2670 + }, + { + "epoch": 2.610813443740867, + "grad_norm": 0.6626771092414856, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 2680 + }, + { + "epoch": 2.620555284948855, + "grad_norm": 0.4587472081184387, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 2690 + }, + { + "epoch": 2.6302971261568437, + "grad_norm": 0.4816797077655792, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 2700 + }, + { + "epoch": 2.640038967364832, + "grad_norm": 0.4856809675693512, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 2710 + }, + { + "epoch": 2.64978080857282, + "grad_norm": 0.46010780334472656, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 2720 + }, + { + "epoch": 2.6595226497808087, + "grad_norm": 0.4637954533100128, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 2730 + }, + { + "epoch": 2.669264490988797, + "grad_norm": 0.5954997539520264, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 2740 + }, + { + "epoch": 2.679006332196785, + "grad_norm": 0.5071861743927002, + "learning_rate": 0.0002, + "loss": 1.5795, + "step": 2750 + }, + { + "epoch": 2.6887481734047736, + "grad_norm": 0.5415477156639099, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.6984900146127617, + "grad_norm": 0.5618549585342407, + "learning_rate": 0.0002, + "loss": 1.5476, + "step": 2770 + }, + { + "epoch": 2.70823185582075, + "grad_norm": 0.49338817596435547, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 2780 + }, + { + "epoch": 2.7179736970287385, + "grad_norm": 0.5149586796760559, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2790 + }, + { + "epoch": 2.7277155382367266, + "grad_norm": 0.6247242093086243, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2800 + }, + { + "epoch": 2.737457379444715, + "grad_norm": 0.4749542474746704, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 2810 + }, + { + "epoch": 2.7471992206527034, + "grad_norm": 0.4979191720485687, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 2820 + }, + { + "epoch": 2.7569410618606915, + "grad_norm": 0.4885074198246002, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2830 + }, + { + "epoch": 2.76668290306868, + "grad_norm": 0.5047747492790222, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 2840 + }, + { + "epoch": 2.7764247442766683, + "grad_norm": 0.5280140042304993, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2850 + }, + { + "epoch": 2.7861665854846565, + "grad_norm": 0.477668434381485, + "learning_rate": 0.0002, + "loss": 1.5639, + "step": 2860 + }, + { + "epoch": 2.795908426692645, + "grad_norm": 0.4816327393054962, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2870 + }, + { + "epoch": 2.805650267900633, + "grad_norm": 0.523259162902832, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 2880 + }, + { + "epoch": 2.8153921091086214, + "grad_norm": 0.5045270919799805, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2890 + }, + { + "epoch": 2.82513395031661, + "grad_norm": 0.47986042499542236, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2900 + }, + { + "epoch": 2.834875791524598, + "grad_norm": 0.4858797490596771, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 2910 + }, + { + "epoch": 2.8446176327325867, + "grad_norm": 0.5261512398719788, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 2920 + }, + { + "epoch": 2.854359473940575, + "grad_norm": 0.630550742149353, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2930 + }, + { + "epoch": 2.864101315148563, + "grad_norm": 0.49119752645492554, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2940 + }, + { + "epoch": 2.8738431563565516, + "grad_norm": 0.4779070317745209, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2950 + }, + { + "epoch": 2.88358499756454, + "grad_norm": 0.5059782266616821, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 2960 + }, + { + "epoch": 2.893326838772528, + "grad_norm": 0.5466655492782593, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2970 + }, + { + "epoch": 2.9030686799805165, + "grad_norm": 0.4865640103816986, + "learning_rate": 0.0002, + "loss": 1.619, + "step": 2980 + }, + { + "epoch": 2.9128105211885047, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 2990 + }, + { + "epoch": 2.922552362396493, + "grad_norm": 0.9112305641174316, + "learning_rate": 0.0002, + "loss": 1.6216, + "step": 3000 + }, + { + "epoch": 2.9322942036044815, + "grad_norm": 0.3938814103603363, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 3010 + }, + { + "epoch": 2.9420360448124696, + "grad_norm": 0.5500800609588623, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 3020 + }, + { + "epoch": 2.9517778860204578, + "grad_norm": 0.5346390604972839, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3030 + }, + { + "epoch": 2.9615197272284464, + "grad_norm": 0.5245014429092407, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3040 + }, + { + "epoch": 2.9712615684364345, + "grad_norm": 0.4906884431838989, + "learning_rate": 0.0002, + "loss": 1.5549, + "step": 3050 + }, + { + "epoch": 2.9810034096444227, + "grad_norm": 0.47086769342422485, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 3060 + }, + { + "epoch": 2.9907452508524113, + "grad_norm": 0.5290229320526123, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3070 + }, + { + "epoch": 2.9995129079396006, + "eval_loss": 1.8463934659957886, + "eval_runtime": 56.2401, + "eval_samples_per_second": 9.015, + "eval_steps_per_second": 1.138, + "step": 3079 + }, + { + "epoch": 3.0004870920603994, + "grad_norm": 0.49992576241493225, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 3080 + }, + { + "epoch": 3.0102289332683876, + "grad_norm": 0.8242783546447754, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3090 + }, + { + "epoch": 3.019970774476376, + "grad_norm": 0.6330569386482239, + "learning_rate": 0.0002, + "loss": 1.394, + "step": 3100 + }, + { + "epoch": 3.0297126156843643, + "grad_norm": 0.566097617149353, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 3110 + }, + { + "epoch": 3.0394544568923525, + "grad_norm": 0.6337586045265198, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 3120 + }, + { + "epoch": 3.049196298100341, + "grad_norm": 0.7339403033256531, + "learning_rate": 0.0002, + "loss": 1.3916, + "step": 3130 + }, + { + "epoch": 3.0589381393083293, + "grad_norm": 0.7187346816062927, + "learning_rate": 0.0002, + "loss": 1.4617, + "step": 3140 + }, + { + "epoch": 3.0686799805163174, + "grad_norm": 0.7116255760192871, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3150 + }, + { + "epoch": 3.078421821724306, + "grad_norm": 0.6493807435035706, + "learning_rate": 0.0002, + "loss": 1.4452, + "step": 3160 + }, + { + "epoch": 3.088163662932294, + "grad_norm": 0.6777266263961792, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 3170 + }, + { + "epoch": 3.0979055041402823, + "grad_norm": 0.6342006325721741, + "learning_rate": 0.0002, + "loss": 1.4362, + "step": 3180 + }, + { + "epoch": 3.107647345348271, + "grad_norm": 0.6608964204788208, + "learning_rate": 0.0002, + "loss": 1.4748, + "step": 3190 + }, + { + "epoch": 3.117389186556259, + "grad_norm": 0.7230247259140015, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 3200 + }, + { + "epoch": 3.1271310277642472, + "grad_norm": 0.650368332862854, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 3210 + }, + { + "epoch": 3.136872868972236, + "grad_norm": 0.7319342494010925, + "learning_rate": 0.0002, + "loss": 1.409, + "step": 3220 + }, + { + "epoch": 3.146614710180224, + "grad_norm": 0.7159963846206665, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 3230 + }, + { + "epoch": 3.156356551388212, + "grad_norm": 0.8905230164527893, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 3240 + }, + { + "epoch": 3.1660983925962007, + "grad_norm": 0.6920804381370544, + "learning_rate": 0.0002, + "loss": 1.3161, + "step": 3250 + }, + { + "epoch": 3.175840233804189, + "grad_norm": 0.6782063841819763, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 3260 + }, + { + "epoch": 3.1855820750121775, + "grad_norm": 0.735325276851654, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3270 + }, + { + "epoch": 3.1953239162201656, + "grad_norm": 0.6657978296279907, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 3280 + }, + { + "epoch": 3.205065757428154, + "grad_norm": 0.771315336227417, + "learning_rate": 0.0002, + "loss": 1.3456, + "step": 3290 + }, + { + "epoch": 3.2148075986361424, + "grad_norm": 0.6492983102798462, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 3300 + }, + { + "epoch": 3.2245494398441306, + "grad_norm": 0.7513770461082458, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 3310 + }, + { + "epoch": 3.2342912810521187, + "grad_norm": 0.7091423869132996, + "learning_rate": 0.0002, + "loss": 1.4032, + "step": 3320 + }, + { + "epoch": 3.2440331222601073, + "grad_norm": 0.6663975119590759, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 3330 + }, + { + "epoch": 3.2537749634680955, + "grad_norm": 0.6813122034072876, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 3340 + }, + { + "epoch": 3.2635168046760836, + "grad_norm": 0.6602569818496704, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 3350 + }, + { + "epoch": 3.2732586458840722, + "grad_norm": 0.718270480632782, + "learning_rate": 0.0002, + "loss": 1.4533, + "step": 3360 + }, + { + "epoch": 3.2830004870920604, + "grad_norm": 0.6884173154830933, + "learning_rate": 0.0002, + "loss": 1.4076, + "step": 3370 + }, + { + "epoch": 3.2927423283000485, + "grad_norm": 0.7039775848388672, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3380 + }, + { + "epoch": 3.302484169508037, + "grad_norm": 0.7444299459457397, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3390 + }, + { + "epoch": 3.3122260107160253, + "grad_norm": 0.7187064290046692, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 3400 + }, + { + "epoch": 3.3219678519240134, + "grad_norm": 0.599396288394928, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3410 + }, + { + "epoch": 3.331709693132002, + "grad_norm": 0.7670390009880066, + "learning_rate": 0.0002, + "loss": 1.4819, + "step": 3420 + }, + { + "epoch": 3.34145153433999, + "grad_norm": 0.6654478311538696, + "learning_rate": 0.0002, + "loss": 1.4411, + "step": 3430 + }, + { + "epoch": 3.351193375547979, + "grad_norm": 0.6644385457038879, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 3440 + }, + { + "epoch": 3.360935216755967, + "grad_norm": 0.6974098086357117, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3450 + }, + { + "epoch": 3.370677057963955, + "grad_norm": 0.7350399494171143, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 3460 + }, + { + "epoch": 3.3804188991719437, + "grad_norm": 0.714721143245697, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 3470 + }, + { + "epoch": 3.390160740379932, + "grad_norm": 0.7006027698516846, + "learning_rate": 0.0002, + "loss": 1.4325, + "step": 3480 + }, + { + "epoch": 3.39990258158792, + "grad_norm": 0.6767925024032593, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3490 + }, + { + "epoch": 3.4096444227959086, + "grad_norm": 0.6721355319023132, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 3500 + }, + { + "epoch": 3.419386264003897, + "grad_norm": 0.6845725178718567, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3510 + }, + { + "epoch": 3.429128105211885, + "grad_norm": 0.6882196664810181, + "learning_rate": 0.0002, + "loss": 1.4832, + "step": 3520 + }, + { + "epoch": 3.4388699464198735, + "grad_norm": 0.7663240432739258, + "learning_rate": 0.0002, + "loss": 1.4962, + "step": 3530 + }, + { + "epoch": 3.4486117876278617, + "grad_norm": 0.6304219365119934, + "learning_rate": 0.0002, + "loss": 1.4644, + "step": 3540 + }, + { + "epoch": 3.45835362883585, + "grad_norm": 0.668678879737854, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3550 + }, + { + "epoch": 3.4680954700438384, + "grad_norm": 0.7526912093162537, + "learning_rate": 0.0002, + "loss": 1.4874, + "step": 3560 + }, + { + "epoch": 3.4778373112518266, + "grad_norm": 1.089495301246643, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 3570 + }, + { + "epoch": 3.4875791524598148, + "grad_norm": 0.7282902002334595, + "learning_rate": 0.0002, + "loss": 1.3871, + "step": 3580 + }, + { + "epoch": 3.4973209936678034, + "grad_norm": 0.6540156602859497, + "learning_rate": 0.0002, + "loss": 1.5077, + "step": 3590 + }, + { + "epoch": 3.5070628348757915, + "grad_norm": 0.6449568867683411, + "learning_rate": 0.0002, + "loss": 1.4367, + "step": 3600 + }, + { + "epoch": 3.5168046760837797, + "grad_norm": 0.7262216210365295, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 3610 + }, + { + "epoch": 3.5265465172917683, + "grad_norm": 0.6048615574836731, + "learning_rate": 0.0002, + "loss": 1.4374, + "step": 3620 + }, + { + "epoch": 3.5362883584997564, + "grad_norm": 0.6780537366867065, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 3630 + }, + { + "epoch": 3.5460301997077446, + "grad_norm": 0.6851925253868103, + "learning_rate": 0.0002, + "loss": 1.422, + "step": 3640 + }, + { + "epoch": 3.555772040915733, + "grad_norm": 0.6530634164810181, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3650 + }, + { + "epoch": 3.5655138821237213, + "grad_norm": 0.7193992733955383, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3660 + }, + { + "epoch": 3.5752557233317095, + "grad_norm": 0.767496645450592, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 3670 + }, + { + "epoch": 3.584997564539698, + "grad_norm": 0.6912919282913208, + "learning_rate": 0.0002, + "loss": 1.4824, + "step": 3680 + }, + { + "epoch": 3.5947394057476862, + "grad_norm": 0.7383436560630798, + "learning_rate": 0.0002, + "loss": 1.4497, + "step": 3690 + }, + { + "epoch": 3.6044812469556744, + "grad_norm": 0.6746662855148315, + "learning_rate": 0.0002, + "loss": 1.4822, + "step": 3700 + }, + { + "epoch": 3.614223088163663, + "grad_norm": 0.6885138750076294, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3710 + }, + { + "epoch": 3.623964929371651, + "grad_norm": 0.6694392561912537, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3720 + }, + { + "epoch": 3.6337067705796393, + "grad_norm": 0.812358021736145, + "learning_rate": 0.0002, + "loss": 1.3719, + "step": 3730 + }, + { + "epoch": 3.643448611787628, + "grad_norm": 0.7267130017280579, + "learning_rate": 0.0002, + "loss": 1.4603, + "step": 3740 + }, + { + "epoch": 3.653190452995616, + "grad_norm": 0.6958749294281006, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3750 + }, + { + "epoch": 3.6629322942036042, + "grad_norm": 0.6805673241615295, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3760 + }, + { + "epoch": 3.672674135411593, + "grad_norm": 0.7184410095214844, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3770 + }, + { + "epoch": 3.682415976619581, + "grad_norm": 0.7716330289840698, + "learning_rate": 0.0002, + "loss": 1.3935, + "step": 3780 + }, + { + "epoch": 3.6921578178275696, + "grad_norm": 0.6675831079483032, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3790 + }, + { + "epoch": 3.7018996590355577, + "grad_norm": 0.6480095386505127, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 3800 + }, + { + "epoch": 3.711641500243546, + "grad_norm": 0.6559418439865112, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 3810 + }, + { + "epoch": 3.7213833414515345, + "grad_norm": 0.6596545577049255, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 3820 + }, + { + "epoch": 3.7311251826595226, + "grad_norm": 0.7172950506210327, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3830 + }, + { + "epoch": 3.740867023867511, + "grad_norm": 0.796148419380188, + "learning_rate": 0.0002, + "loss": 1.446, + "step": 3840 + }, + { + "epoch": 3.7506088650754994, + "grad_norm": 0.6600322723388672, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 3850 + }, + { + "epoch": 3.7603507062834876, + "grad_norm": 0.6776387691497803, + "learning_rate": 0.0002, + "loss": 1.4201, + "step": 3860 + }, + { + "epoch": 3.770092547491476, + "grad_norm": 0.7768304347991943, + "learning_rate": 0.0002, + "loss": 1.3893, + "step": 3870 + }, + { + "epoch": 3.7798343886994643, + "grad_norm": 1.0579794645309448, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 3880 + }, + { + "epoch": 3.7895762299074525, + "grad_norm": 0.6757252812385559, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 3890 + }, + { + "epoch": 3.799318071115441, + "grad_norm": 0.6706996560096741, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3900 + }, + { + "epoch": 3.809059912323429, + "grad_norm": 0.7026948928833008, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 3910 + }, + { + "epoch": 3.8188017535314174, + "grad_norm": 0.6437768340110779, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 3920 + }, + { + "epoch": 3.828543594739406, + "grad_norm": 0.7015706300735474, + "learning_rate": 0.0002, + "loss": 1.4678, + "step": 3930 + }, + { + "epoch": 3.838285435947394, + "grad_norm": 0.7049482464790344, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 3940 + }, + { + "epoch": 3.8480272771553823, + "grad_norm": 0.6533724665641785, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3950 + }, + { + "epoch": 3.857769118363371, + "grad_norm": 0.7312499284744263, + "learning_rate": 0.0002, + "loss": 1.4435, + "step": 3960 + }, + { + "epoch": 3.867510959571359, + "grad_norm": 0.6858801245689392, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 3970 + }, + { + "epoch": 3.877252800779347, + "grad_norm": 0.770423173904419, + "learning_rate": 0.0002, + "loss": 1.4423, + "step": 3980 + }, + { + "epoch": 3.886994641987336, + "grad_norm": 0.6987539529800415, + "learning_rate": 0.0002, + "loss": 1.5029, + "step": 3990 + }, + { + "epoch": 3.896736483195324, + "grad_norm": 0.7072722315788269, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 4000 + }, + { + "epoch": 3.906478324403312, + "grad_norm": 0.6492931842803955, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4010 + }, + { + "epoch": 3.9162201656113007, + "grad_norm": 0.7716232538223267, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 4020 + }, + { + "epoch": 3.925962006819289, + "grad_norm": 0.722949743270874, + "learning_rate": 0.0002, + "loss": 1.4758, + "step": 4030 + }, + { + "epoch": 3.935703848027277, + "grad_norm": 0.7434365749359131, + "learning_rate": 0.0002, + "loss": 1.3914, + "step": 4040 + }, + { + "epoch": 3.9454456892352656, + "grad_norm": 0.6691509485244751, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 4050 + }, + { + "epoch": 3.9551875304432538, + "grad_norm": 0.6850284337997437, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4060 + }, + { + "epoch": 3.964929371651242, + "grad_norm": 0.6954452991485596, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 4070 + }, + { + "epoch": 3.9746712128592305, + "grad_norm": 0.9316364526748657, + "learning_rate": 0.0002, + "loss": 1.417, + "step": 4080 + }, + { + "epoch": 3.9844130540672187, + "grad_norm": 0.6908289194107056, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 4090 + }, + { + "epoch": 3.994154895275207, + "grad_norm": 0.666782021522522, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 4100 + }, + { + "epoch": 4.0, + "eval_loss": 1.9233275651931763, + "eval_runtime": 55.9536, + "eval_samples_per_second": 9.061, + "eval_steps_per_second": 1.144, + "step": 4106 + }, + { + "epoch": 4.003896736483195, + "grad_norm": 0.7726166248321533, + "learning_rate": 0.0002, + "loss": 1.3489, + "step": 4110 + }, + { + "epoch": 4.013638577691184, + "grad_norm": 1.1338967084884644, + "learning_rate": 0.0002, + "loss": 1.1415, + "step": 4120 + }, + { + "epoch": 4.023380418899172, + "grad_norm": 0.9530029296875, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 4130 + }, + { + "epoch": 4.03312226010716, + "grad_norm": 1.1058554649353027, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 4140 + }, + { + "epoch": 4.042864101315149, + "grad_norm": 0.8765049576759338, + "learning_rate": 0.0002, + "loss": 1.2381, + "step": 4150 + }, + { + "epoch": 4.052605942523137, + "grad_norm": 1.1774667501449585, + "learning_rate": 0.0002, + "loss": 1.2708, + "step": 4160 + }, + { + "epoch": 4.062347783731125, + "grad_norm": 0.9301433563232422, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 4170 + }, + { + "epoch": 4.072089624939114, + "grad_norm": 1.0196778774261475, + "learning_rate": 0.0002, + "loss": 1.1807, + "step": 4180 + }, + { + "epoch": 4.081831466147102, + "grad_norm": 1.1380577087402344, + "learning_rate": 0.0002, + "loss": 1.2602, + "step": 4190 + }, + { + "epoch": 4.09157330735509, + "grad_norm": 0.9121319651603699, + "learning_rate": 0.0002, + "loss": 1.2521, + "step": 4200 + }, + { + "epoch": 4.101315148563079, + "grad_norm": 0.9495378732681274, + "learning_rate": 0.0002, + "loss": 1.1747, + "step": 4210 + }, + { + "epoch": 4.1110569897710665, + "grad_norm": 0.8058680295944214, + "learning_rate": 0.0002, + "loss": 1.1829, + "step": 4220 + }, + { + "epoch": 4.120798830979055, + "grad_norm": 1.000887393951416, + "learning_rate": 0.0002, + "loss": 1.1732, + "step": 4230 + }, + { + "epoch": 4.130540672187044, + "grad_norm": 0.9529102444648743, + "learning_rate": 0.0002, + "loss": 1.1947, + "step": 4240 + }, + { + "epoch": 4.140282513395031, + "grad_norm": 1.0257115364074707, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 4250 + }, + { + "epoch": 4.15002435460302, + "grad_norm": 0.9590303897857666, + "learning_rate": 0.0002, + "loss": 1.2293, + "step": 4260 + }, + { + "epoch": 4.159766195811009, + "grad_norm": 1.065291166305542, + "learning_rate": 0.0002, + "loss": 1.1918, + "step": 4270 + }, + { + "epoch": 4.169508037018996, + "grad_norm": 0.8819697499275208, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 4280 + }, + { + "epoch": 4.179249878226985, + "grad_norm": 1.0335261821746826, + "learning_rate": 0.0002, + "loss": 1.2167, + "step": 4290 + }, + { + "epoch": 4.1889917194349735, + "grad_norm": 0.8872809410095215, + "learning_rate": 0.0002, + "loss": 1.2131, + "step": 4300 + }, + { + "epoch": 4.198733560642961, + "grad_norm": 0.9883159399032593, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 4310 + }, + { + "epoch": 4.20847540185095, + "grad_norm": 1.0254192352294922, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 4320 + }, + { + "epoch": 4.218217243058938, + "grad_norm": 0.9432600736618042, + "learning_rate": 0.0002, + "loss": 1.2595, + "step": 4330 + }, + { + "epoch": 4.227959084266926, + "grad_norm": 1.1008676290512085, + "learning_rate": 0.0002, + "loss": 1.2684, + "step": 4340 + }, + { + "epoch": 4.237700925474915, + "grad_norm": 1.0829699039459229, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 4350 + }, + { + "epoch": 4.247442766682903, + "grad_norm": 1.016847848892212, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 4360 + }, + { + "epoch": 4.257184607890891, + "grad_norm": 0.8924864530563354, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 4370 + }, + { + "epoch": 4.26692644909888, + "grad_norm": 0.9300530552864075, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 4380 + }, + { + "epoch": 4.276668290306868, + "grad_norm": 0.9684814810752869, + "learning_rate": 0.0002, + "loss": 1.1696, + "step": 4390 + }, + { + "epoch": 4.286410131514856, + "grad_norm": 0.9916250705718994, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 4400 + }, + { + "epoch": 4.2961519727228445, + "grad_norm": 0.903680145740509, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 4410 + }, + { + "epoch": 4.305893813930833, + "grad_norm": 0.8713505268096924, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 4420 + }, + { + "epoch": 4.315635655138821, + "grad_norm": 0.9983905553817749, + "learning_rate": 0.0002, + "loss": 1.1957, + "step": 4430 + }, + { + "epoch": 4.3253774963468095, + "grad_norm": 1.1689040660858154, + "learning_rate": 0.0002, + "loss": 1.2676, + "step": 4440 + }, + { + "epoch": 4.335119337554798, + "grad_norm": 0.9316853880882263, + "learning_rate": 0.0002, + "loss": 1.2166, + "step": 4450 + }, + { + "epoch": 4.344861178762786, + "grad_norm": 0.9175887107849121, + "learning_rate": 0.0002, + "loss": 1.222, + "step": 4460 + }, + { + "epoch": 4.354603019970774, + "grad_norm": 0.9348906874656677, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4470 + }, + { + "epoch": 4.364344861178763, + "grad_norm": 0.9727016687393188, + "learning_rate": 0.0002, + "loss": 1.2764, + "step": 4480 + }, + { + "epoch": 4.374086702386751, + "grad_norm": 0.9843429923057556, + "learning_rate": 0.0002, + "loss": 1.2616, + "step": 4490 + }, + { + "epoch": 4.383828543594739, + "grad_norm": 0.9615852236747742, + "learning_rate": 0.0002, + "loss": 1.2488, + "step": 4500 + }, + { + "epoch": 4.393570384802728, + "grad_norm": 0.9688583612442017, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 4510 + }, + { + "epoch": 4.403312226010716, + "grad_norm": 0.9933668375015259, + "learning_rate": 0.0002, + "loss": 1.2546, + "step": 4520 + }, + { + "epoch": 4.413054067218704, + "grad_norm": 1.0626686811447144, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 4530 + }, + { + "epoch": 4.422795908426693, + "grad_norm": 0.9536267518997192, + "learning_rate": 0.0002, + "loss": 1.2425, + "step": 4540 + }, + { + "epoch": 4.432537749634681, + "grad_norm": 0.9777140021324158, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 4550 + }, + { + "epoch": 4.442279590842669, + "grad_norm": 0.980780839920044, + "learning_rate": 0.0002, + "loss": 1.2878, + "step": 4560 + }, + { + "epoch": 4.452021432050658, + "grad_norm": 1.0147196054458618, + "learning_rate": 0.0002, + "loss": 1.2597, + "step": 4570 + }, + { + "epoch": 4.461763273258645, + "grad_norm": 0.9763361811637878, + "learning_rate": 0.0002, + "loss": 1.2148, + "step": 4580 + }, + { + "epoch": 4.471505114466634, + "grad_norm": 1.0300798416137695, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 4590 + }, + { + "epoch": 4.481246955674623, + "grad_norm": 0.8833121657371521, + "learning_rate": 0.0002, + "loss": 1.2665, + "step": 4600 + }, + { + "epoch": 4.490988796882611, + "grad_norm": 1.1214020252227783, + "learning_rate": 0.0002, + "loss": 1.1899, + "step": 4610 + }, + { + "epoch": 4.500730638090599, + "grad_norm": 0.8843787908554077, + "learning_rate": 0.0002, + "loss": 1.2579, + "step": 4620 + }, + { + "epoch": 4.5104724792985875, + "grad_norm": 0.9942020773887634, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 4630 + }, + { + "epoch": 4.520214320506576, + "grad_norm": 1.0033202171325684, + "learning_rate": 0.0002, + "loss": 1.3172, + "step": 4640 + }, + { + "epoch": 4.529956161714564, + "grad_norm": 0.8767235279083252, + "learning_rate": 0.0002, + "loss": 1.2024, + "step": 4650 + }, + { + "epoch": 4.539698002922552, + "grad_norm": 1.0117276906967163, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 4660 + }, + { + "epoch": 4.549439844130541, + "grad_norm": 1.2787362337112427, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 4670 + }, + { + "epoch": 4.559181685338529, + "grad_norm": 0.8824878931045532, + "learning_rate": 0.0002, + "loss": 1.2603, + "step": 4680 + }, + { + "epoch": 4.568923526546517, + "grad_norm": 0.9209560751914978, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 4690 + }, + { + "epoch": 4.578665367754506, + "grad_norm": 1.1064010858535767, + "learning_rate": 0.0002, + "loss": 1.1916, + "step": 4700 + }, + { + "epoch": 4.588407208962494, + "grad_norm": 0.8914572596549988, + "learning_rate": 0.0002, + "loss": 1.2217, + "step": 4710 + }, + { + "epoch": 4.598149050170482, + "grad_norm": 1.0412265062332153, + "learning_rate": 0.0002, + "loss": 1.2861, + "step": 4720 + }, + { + "epoch": 4.607890891378471, + "grad_norm": 1.1950221061706543, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 4730 + }, + { + "epoch": 4.617632732586459, + "grad_norm": 0.8938062787055969, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 4740 + }, + { + "epoch": 4.627374573794447, + "grad_norm": 0.9849569201469421, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 4750 + }, + { + "epoch": 4.637116415002436, + "grad_norm": 1.0081515312194824, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4760 + }, + { + "epoch": 4.6468582562104235, + "grad_norm": 0.8566309213638306, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 4770 + }, + { + "epoch": 4.656600097418412, + "grad_norm": 1.1750118732452393, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 4780 + }, + { + "epoch": 4.666341938626401, + "grad_norm": 0.925502598285675, + "learning_rate": 0.0002, + "loss": 1.2537, + "step": 4790 + }, + { + "epoch": 4.676083779834388, + "grad_norm": 1.0402472019195557, + "learning_rate": 0.0002, + "loss": 1.2146, + "step": 4800 + }, + { + "epoch": 4.685825621042377, + "grad_norm": 0.9772472977638245, + "learning_rate": 0.0002, + "loss": 1.2555, + "step": 4810 + }, + { + "epoch": 4.695567462250366, + "grad_norm": 0.9082779288291931, + "learning_rate": 0.0002, + "loss": 1.2667, + "step": 4820 + }, + { + "epoch": 4.705309303458353, + "grad_norm": 0.8026862740516663, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 4830 + }, + { + "epoch": 4.715051144666342, + "grad_norm": 1.1631089448928833, + "learning_rate": 0.0002, + "loss": 1.3369, + "step": 4840 + }, + { + "epoch": 4.7247929858743305, + "grad_norm": 0.9384787678718567, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 4850 + }, + { + "epoch": 4.734534827082318, + "grad_norm": 1.2151581048965454, + "learning_rate": 0.0002, + "loss": 1.2588, + "step": 4860 + }, + { + "epoch": 4.744276668290307, + "grad_norm": 0.9679436087608337, + "learning_rate": 0.0002, + "loss": 1.363, + "step": 4870 + }, + { + "epoch": 4.754018509498295, + "grad_norm": 0.8352158069610596, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 4880 + }, + { + "epoch": 4.763760350706283, + "grad_norm": 1.0205804109573364, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 4890 + }, + { + "epoch": 4.773502191914272, + "grad_norm": 0.9814772605895996, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 4900 + }, + { + "epoch": 4.78324403312226, + "grad_norm": 1.002854347229004, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 4910 + }, + { + "epoch": 4.792985874330248, + "grad_norm": 1.1609505414962769, + "learning_rate": 0.0002, + "loss": 1.3143, + "step": 4920 + }, + { + "epoch": 4.802727715538237, + "grad_norm": 0.9354982376098633, + "learning_rate": 0.0002, + "loss": 1.3166, + "step": 4930 + }, + { + "epoch": 4.812469556746225, + "grad_norm": 0.9761685729026794, + "learning_rate": 0.0002, + "loss": 1.2978, + "step": 4940 + }, + { + "epoch": 4.822211397954213, + "grad_norm": 1.0604596138000488, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 4950 + }, + { + "epoch": 4.8319532391622015, + "grad_norm": 1.0902808904647827, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 4960 + }, + { + "epoch": 4.84169508037019, + "grad_norm": 1.0174955129623413, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 4970 + }, + { + "epoch": 4.851436921578179, + "grad_norm": 1.0995253324508667, + "learning_rate": 0.0002, + "loss": 1.3141, + "step": 4980 + }, + { + "epoch": 4.8611787627861665, + "grad_norm": 0.880993127822876, + "learning_rate": 0.0002, + "loss": 1.3006, + "step": 4990 + }, + { + "epoch": 4.870920603994155, + "grad_norm": 0.9472237825393677, + "learning_rate": 0.0002, + "loss": 1.2547, + "step": 5000 + }, + { + "epoch": 4.880662445202143, + "grad_norm": 0.9504236578941345, + "learning_rate": 0.0002, + "loss": 1.4078, + "step": 5010 + }, + { + "epoch": 4.890404286410131, + "grad_norm": 1.1261742115020752, + "learning_rate": 0.0002, + "loss": 1.2791, + "step": 5020 + }, + { + "epoch": 4.90014612761812, + "grad_norm": 0.904674768447876, + "learning_rate": 0.0002, + "loss": 1.3707, + "step": 5030 + }, + { + "epoch": 4.909887968826109, + "grad_norm": 0.8828991055488586, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 5040 + }, + { + "epoch": 4.919629810034096, + "grad_norm": 1.0156532526016235, + "learning_rate": 0.0002, + "loss": 1.2905, + "step": 5050 + }, + { + "epoch": 4.929371651242085, + "grad_norm": 0.8975168466567993, + "learning_rate": 0.0002, + "loss": 1.3079, + "step": 5060 + }, + { + "epoch": 4.939113492450073, + "grad_norm": 0.9787213802337646, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 5070 + }, + { + "epoch": 4.948855333658061, + "grad_norm": 1.0801568031311035, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 5080 + }, + { + "epoch": 4.95859717486605, + "grad_norm": 1.0655089616775513, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 5090 + }, + { + "epoch": 4.968339016074038, + "grad_norm": 0.8941320180892944, + "learning_rate": 0.0002, + "loss": 1.2449, + "step": 5100 + }, + { + "epoch": 4.978080857282026, + "grad_norm": 1.050621747970581, + "learning_rate": 0.0002, + "loss": 1.2846, + "step": 5110 + }, + { + "epoch": 4.987822698490015, + "grad_norm": 0.9724781513214111, + "learning_rate": 0.0002, + "loss": 1.3791, + "step": 5120 + }, + { + "epoch": 4.997564539698003, + "grad_norm": 0.9850538969039917, + "learning_rate": 0.0002, + "loss": 1.292, + "step": 5130 + }, + { + "epoch": 4.9995129079396, + "eval_loss": 2.0824170112609863, + "eval_runtime": 55.592, + "eval_samples_per_second": 9.12, + "eval_steps_per_second": 1.151, + "step": 5132 + }, + { + "epoch": 5.007306380905991, + "grad_norm": 1.0096189975738525, + "learning_rate": 0.0002, + "loss": 1.037, + "step": 5140 + }, + { + "epoch": 5.01704822211398, + "grad_norm": 1.2403408288955688, + "learning_rate": 0.0002, + "loss": 1.0003, + "step": 5150 + }, + { + "epoch": 5.026790063321968, + "grad_norm": 1.1243221759796143, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 5160 + }, + { + "epoch": 5.036531904529956, + "grad_norm": 1.4745502471923828, + "learning_rate": 0.0002, + "loss": 0.9815, + "step": 5170 + }, + { + "epoch": 5.0462737457379445, + "grad_norm": 1.1913198232650757, + "learning_rate": 0.0002, + "loss": 0.9715, + "step": 5180 + }, + { + "epoch": 5.056015586945933, + "grad_norm": 1.2732855081558228, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 5190 + }, + { + "epoch": 5.065757428153921, + "grad_norm": 1.1737396717071533, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 5200 + }, + { + "epoch": 5.075499269361909, + "grad_norm": 1.4162768125534058, + "learning_rate": 0.0002, + "loss": 0.9754, + "step": 5210 + }, + { + "epoch": 5.085241110569898, + "grad_norm": 1.528274655342102, + "learning_rate": 0.0002, + "loss": 1.0333, + "step": 5220 + }, + { + "epoch": 5.094982951777886, + "grad_norm": 1.3966618776321411, + "learning_rate": 0.0002, + "loss": 1.0227, + "step": 5230 + }, + { + "epoch": 5.104724792985874, + "grad_norm": 1.3427953720092773, + "learning_rate": 0.0002, + "loss": 0.987, + "step": 5240 + }, + { + "epoch": 5.114466634193863, + "grad_norm": 1.6533905267715454, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 5250 + }, + { + "epoch": 5.124208475401851, + "grad_norm": 1.4114865064620972, + "learning_rate": 0.0002, + "loss": 1.0452, + "step": 5260 + }, + { + "epoch": 5.133950316609839, + "grad_norm": 1.5460708141326904, + "learning_rate": 0.0002, + "loss": 1.067, + "step": 5270 + }, + { + "epoch": 5.143692157817828, + "grad_norm": 1.3491919040679932, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 5280 + }, + { + "epoch": 5.153433999025816, + "grad_norm": 1.2208969593048096, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 5290 + }, + { + "epoch": 5.163175840233804, + "grad_norm": 1.1141403913497925, + "learning_rate": 0.0002, + "loss": 1.0362, + "step": 5300 + }, + { + "epoch": 5.172917681441793, + "grad_norm": 1.2938064336776733, + "learning_rate": 0.0002, + "loss": 0.9744, + "step": 5310 + }, + { + "epoch": 5.1826595226497805, + "grad_norm": 1.2704918384552002, + "learning_rate": 0.0002, + "loss": 1.0438, + "step": 5320 + }, + { + "epoch": 5.192401363857769, + "grad_norm": 1.3928544521331787, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 5330 + }, + { + "epoch": 5.202143205065758, + "grad_norm": 1.1993824243545532, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 5340 + }, + { + "epoch": 5.211885046273745, + "grad_norm": 1.5913670063018799, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 5350 + }, + { + "epoch": 5.221626887481734, + "grad_norm": 1.1577855348587036, + "learning_rate": 0.0002, + "loss": 1.0113, + "step": 5360 + }, + { + "epoch": 5.231368728689723, + "grad_norm": 1.4535993337631226, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 5370 + }, + { + "epoch": 5.24111056989771, + "grad_norm": 1.5068976879119873, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 5380 + }, + { + "epoch": 5.250852411105699, + "grad_norm": 1.2365459203720093, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 5390 + }, + { + "epoch": 5.2605942523136875, + "grad_norm": 1.3197922706604004, + "learning_rate": 0.0002, + "loss": 1.0145, + "step": 5400 + }, + { + "epoch": 5.270336093521675, + "grad_norm": 1.2395117282867432, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 5410 + }, + { + "epoch": 5.280077934729664, + "grad_norm": 1.1841236352920532, + "learning_rate": 0.0002, + "loss": 1.0292, + "step": 5420 + }, + { + "epoch": 5.289819775937652, + "grad_norm": 1.218003749847412, + "learning_rate": 0.0002, + "loss": 1.0233, + "step": 5430 + }, + { + "epoch": 5.29956161714564, + "grad_norm": 1.2210947275161743, + "learning_rate": 0.0002, + "loss": 1.0093, + "step": 5440 + }, + { + "epoch": 5.309303458353629, + "grad_norm": 1.266006588935852, + "learning_rate": 0.0002, + "loss": 0.9619, + "step": 5450 + }, + { + "epoch": 5.319045299561617, + "grad_norm": 1.2598075866699219, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 5460 + }, + { + "epoch": 5.328787140769606, + "grad_norm": 1.2410019636154175, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 5470 + }, + { + "epoch": 5.338528981977594, + "grad_norm": 1.249698519706726, + "learning_rate": 0.0002, + "loss": 1.058, + "step": 5480 + }, + { + "epoch": 5.348270823185582, + "grad_norm": 1.2398173809051514, + "learning_rate": 0.0002, + "loss": 1.0457, + "step": 5490 + }, + { + "epoch": 5.35801266439357, + "grad_norm": 1.2416654825210571, + "learning_rate": 0.0002, + "loss": 1.0139, + "step": 5500 + }, + { + "epoch": 5.3677545056015585, + "grad_norm": 1.398706316947937, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 5510 + }, + { + "epoch": 5.377496346809547, + "grad_norm": 1.3049418926239014, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 5520 + }, + { + "epoch": 5.387238188017536, + "grad_norm": 1.2528893947601318, + "learning_rate": 0.0002, + "loss": 1.0912, + "step": 5530 + }, + { + "epoch": 5.3969800292255234, + "grad_norm": 1.2963255643844604, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 5540 + }, + { + "epoch": 5.406721870433512, + "grad_norm": 1.494231104850769, + "learning_rate": 0.0002, + "loss": 1.0194, + "step": 5550 + }, + { + "epoch": 5.416463711641501, + "grad_norm": 1.2760992050170898, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 5560 + }, + { + "epoch": 5.426205552849488, + "grad_norm": 1.195292592048645, + "learning_rate": 0.0002, + "loss": 1.1088, + "step": 5570 + }, + { + "epoch": 5.435947394057477, + "grad_norm": 1.6408965587615967, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 5580 + }, + { + "epoch": 5.4456892352654656, + "grad_norm": 1.3092058897018433, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 5590 + }, + { + "epoch": 5.455431076473453, + "grad_norm": 1.2960586547851562, + "learning_rate": 0.0002, + "loss": 1.006, + "step": 5600 + }, + { + "epoch": 5.465172917681442, + "grad_norm": 1.3560487031936646, + "learning_rate": 0.0002, + "loss": 1.0257, + "step": 5610 + }, + { + "epoch": 5.4749147588894305, + "grad_norm": 1.1896311044692993, + "learning_rate": 0.0002, + "loss": 1.0314, + "step": 5620 + }, + { + "epoch": 5.484656600097418, + "grad_norm": 1.3145595788955688, + "learning_rate": 0.0002, + "loss": 1.0435, + "step": 5630 + }, + { + "epoch": 5.494398441305407, + "grad_norm": 1.2207404375076294, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 5640 + }, + { + "epoch": 5.504140282513395, + "grad_norm": 1.266015887260437, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 5650 + }, + { + "epoch": 5.513882123721383, + "grad_norm": 1.2478289604187012, + "learning_rate": 0.0002, + "loss": 1.0696, + "step": 5660 + }, + { + "epoch": 5.523623964929372, + "grad_norm": 1.4851372241973877, + "learning_rate": 0.0002, + "loss": 1.0695, + "step": 5670 + }, + { + "epoch": 5.53336580613736, + "grad_norm": 1.4478679895401, + "learning_rate": 0.0002, + "loss": 1.0736, + "step": 5680 + }, + { + "epoch": 5.543107647345348, + "grad_norm": 1.1079537868499756, + "learning_rate": 0.0002, + "loss": 1.043, + "step": 5690 + }, + { + "epoch": 5.552849488553337, + "grad_norm": 1.4201879501342773, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 5700 + }, + { + "epoch": 5.562591329761325, + "grad_norm": 1.2092000246047974, + "learning_rate": 0.0002, + "loss": 1.0697, + "step": 5710 + }, + { + "epoch": 5.572333170969313, + "grad_norm": 1.4515851736068726, + "learning_rate": 0.0002, + "loss": 0.9868, + "step": 5720 + }, + { + "epoch": 5.5820750121773015, + "grad_norm": 1.3260412216186523, + "learning_rate": 0.0002, + "loss": 1.1547, + "step": 5730 + }, + { + "epoch": 5.59181685338529, + "grad_norm": 1.248191475868225, + "learning_rate": 0.0002, + "loss": 1.1388, + "step": 5740 + }, + { + "epoch": 5.601558694593278, + "grad_norm": 1.2037307024002075, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 5750 + }, + { + "epoch": 5.611300535801266, + "grad_norm": 1.341237187385559, + "learning_rate": 0.0002, + "loss": 1.1425, + "step": 5760 + }, + { + "epoch": 5.621042377009255, + "grad_norm": 1.130115270614624, + "learning_rate": 0.0002, + "loss": 1.0942, + "step": 5770 + }, + { + "epoch": 5.630784218217243, + "grad_norm": 1.3834772109985352, + "learning_rate": 0.0002, + "loss": 1.1029, + "step": 5780 + }, + { + "epoch": 5.640526059425231, + "grad_norm": 1.2586270570755005, + "learning_rate": 0.0002, + "loss": 1.0825, + "step": 5790 + }, + { + "epoch": 5.65026790063322, + "grad_norm": 1.3233023881912231, + "learning_rate": 0.0002, + "loss": 1.0186, + "step": 5800 + }, + { + "epoch": 5.660009741841208, + "grad_norm": 1.2711341381072998, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 5810 + }, + { + "epoch": 5.669751583049196, + "grad_norm": 1.3867720365524292, + "learning_rate": 0.0002, + "loss": 1.0897, + "step": 5820 + }, + { + "epoch": 5.679493424257185, + "grad_norm": 1.4783269166946411, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 5830 + }, + { + "epoch": 5.6892352654651726, + "grad_norm": 1.2744768857955933, + "learning_rate": 0.0002, + "loss": 1.0632, + "step": 5840 + }, + { + "epoch": 5.698977106673161, + "grad_norm": 1.3405882120132446, + "learning_rate": 0.0002, + "loss": 1.1484, + "step": 5850 + }, + { + "epoch": 5.70871894788115, + "grad_norm": 1.204300880432129, + "learning_rate": 0.0002, + "loss": 1.0975, + "step": 5860 + }, + { + "epoch": 5.7184607890891375, + "grad_norm": 1.2954572439193726, + "learning_rate": 0.0002, + "loss": 1.0494, + "step": 5870 + }, + { + "epoch": 5.728202630297126, + "grad_norm": 1.5478382110595703, + "learning_rate": 0.0002, + "loss": 1.0643, + "step": 5880 + }, + { + "epoch": 5.737944471505115, + "grad_norm": 1.2095842361450195, + "learning_rate": 0.0002, + "loss": 1.0582, + "step": 5890 + }, + { + "epoch": 5.747686312713103, + "grad_norm": 1.0691519975662231, + "learning_rate": 0.0002, + "loss": 1.1, + "step": 5900 + }, + { + "epoch": 5.757428153921091, + "grad_norm": 1.1920677423477173, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 5910 + }, + { + "epoch": 5.76716999512908, + "grad_norm": 1.2051277160644531, + "learning_rate": 0.0002, + "loss": 1.1746, + "step": 5920 + }, + { + "epoch": 5.776911836337067, + "grad_norm": 1.197490930557251, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 5930 + }, + { + "epoch": 5.786653677545056, + "grad_norm": 1.2003998756408691, + "learning_rate": 0.0002, + "loss": 1.07, + "step": 5940 + }, + { + "epoch": 5.7963955187530445, + "grad_norm": 1.2323646545410156, + "learning_rate": 0.0002, + "loss": 1.0938, + "step": 5950 + }, + { + "epoch": 5.806137359961033, + "grad_norm": 1.2593932151794434, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 5960 + }, + { + "epoch": 5.815879201169021, + "grad_norm": 1.1835976839065552, + "learning_rate": 0.0002, + "loss": 1.0829, + "step": 5970 + }, + { + "epoch": 5.825621042377009, + "grad_norm": 1.4770104885101318, + "learning_rate": 0.0002, + "loss": 1.1056, + "step": 5980 + }, + { + "epoch": 5.835362883584997, + "grad_norm": 1.1025809049606323, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 5990 + }, + { + "epoch": 5.845104724792986, + "grad_norm": 1.364588975906372, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 6000 + }, + { + "epoch": 5.854846566000974, + "grad_norm": 1.2340112924575806, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 6010 + }, + { + "epoch": 5.864588407208963, + "grad_norm": 1.4925711154937744, + "learning_rate": 0.0002, + "loss": 1.1123, + "step": 6020 + }, + { + "epoch": 5.874330248416951, + "grad_norm": 1.3516744375228882, + "learning_rate": 0.0002, + "loss": 1.12, + "step": 6030 + }, + { + "epoch": 5.884072089624939, + "grad_norm": 1.2058138847351074, + "learning_rate": 0.0002, + "loss": 1.1399, + "step": 6040 + }, + { + "epoch": 5.893813930832927, + "grad_norm": 1.13870108127594, + "learning_rate": 0.0002, + "loss": 1.1074, + "step": 6050 + }, + { + "epoch": 5.9035557720409155, + "grad_norm": 1.1587319374084473, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 6060 + }, + { + "epoch": 5.913297613248904, + "grad_norm": 1.164481520652771, + "learning_rate": 0.0002, + "loss": 1.1376, + "step": 6070 + }, + { + "epoch": 5.923039454456893, + "grad_norm": 1.2115206718444824, + "learning_rate": 0.0002, + "loss": 1.1262, + "step": 6080 + }, + { + "epoch": 5.93278129566488, + "grad_norm": 1.3201590776443481, + "learning_rate": 0.0002, + "loss": 1.1345, + "step": 6090 + }, + { + "epoch": 5.942523136872869, + "grad_norm": 1.287380576133728, + "learning_rate": 0.0002, + "loss": 1.1288, + "step": 6100 + }, + { + "epoch": 5.952264978080858, + "grad_norm": 1.1820166110992432, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 6110 + }, + { + "epoch": 5.962006819288845, + "grad_norm": 1.2550667524337769, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 6120 + }, + { + "epoch": 5.971748660496834, + "grad_norm": 1.3547813892364502, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 6130 + }, + { + "epoch": 5.9814905017048225, + "grad_norm": 1.260842204093933, + "learning_rate": 0.0002, + "loss": 1.0557, + "step": 6140 + }, + { + "epoch": 5.99123234291281, + "grad_norm": 1.1643036603927612, + "learning_rate": 0.0002, + "loss": 1.1119, + "step": 6150 + }, + { + "epoch": 6.0, + "eval_loss": 2.2628161907196045, + "eval_runtime": 57.2379, + "eval_samples_per_second": 8.858, + "eval_steps_per_second": 1.118, + "step": 6159 + }, + { + "epoch": 6.000974184120799, + "grad_norm": 0.9384723901748657, + "learning_rate": 0.0002, + "loss": 1.0837, + "step": 6160 + }, + { + "epoch": 6.0107160253287875, + "grad_norm": 2.1525821685791016, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6170 + }, + { + "epoch": 6.020457866536775, + "grad_norm": 2.0194077491760254, + "learning_rate": 0.0002, + "loss": 0.8416, + "step": 6180 + }, + { + "epoch": 6.030199707744764, + "grad_norm": 1.5257816314697266, + "learning_rate": 0.0002, + "loss": 0.8443, + "step": 6190 + }, + { + "epoch": 6.039941548952752, + "grad_norm": 1.5432662963867188, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 6200 + }, + { + "epoch": 6.04968339016074, + "grad_norm": 1.6874405145645142, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 6210 + }, + { + "epoch": 6.059425231368729, + "grad_norm": 1.7346407175064087, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 6220 + }, + { + "epoch": 6.069167072576717, + "grad_norm": 1.5320781469345093, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6230 + }, + { + "epoch": 6.078908913784705, + "grad_norm": 1.4106669425964355, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 6240 + }, + { + "epoch": 6.088650754992694, + "grad_norm": 1.5568628311157227, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 6250 + }, + { + "epoch": 6.098392596200682, + "grad_norm": 1.6155978441238403, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 6260 + }, + { + "epoch": 6.10813443740867, + "grad_norm": 1.4820445775985718, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 6270 + }, + { + "epoch": 6.1178762786166585, + "grad_norm": 1.6163820028305054, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 6280 + }, + { + "epoch": 6.127618119824647, + "grad_norm": 1.8396387100219727, + "learning_rate": 0.0002, + "loss": 0.853, + "step": 6290 + }, + { + "epoch": 6.137359961032635, + "grad_norm": 1.7181230783462524, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 6300 + }, + { + "epoch": 6.147101802240623, + "grad_norm": 1.6568509340286255, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 6310 + }, + { + "epoch": 6.156843643448612, + "grad_norm": 1.3481947183609009, + "learning_rate": 0.0002, + "loss": 0.8525, + "step": 6320 + }, + { + "epoch": 6.1665854846566, + "grad_norm": 1.5788342952728271, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 6330 + }, + { + "epoch": 6.176327325864588, + "grad_norm": 1.5067620277404785, + "learning_rate": 0.0002, + "loss": 0.886, + "step": 6340 + }, + { + "epoch": 6.186069167072577, + "grad_norm": 1.8198208808898926, + "learning_rate": 0.0002, + "loss": 0.8375, + "step": 6350 + }, + { + "epoch": 6.195811008280565, + "grad_norm": 1.4012749195098877, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6360 + }, + { + "epoch": 6.205552849488553, + "grad_norm": 1.759798288345337, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 6370 + }, + { + "epoch": 6.215294690696542, + "grad_norm": 1.468922734260559, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6380 + }, + { + "epoch": 6.2250365319045295, + "grad_norm": 1.3706471920013428, + "learning_rate": 0.0002, + "loss": 0.8356, + "step": 6390 + }, + { + "epoch": 6.234778373112518, + "grad_norm": 1.6397383213043213, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 6400 + }, + { + "epoch": 6.244520214320507, + "grad_norm": 1.5614187717437744, + "learning_rate": 0.0002, + "loss": 0.8834, + "step": 6410 + }, + { + "epoch": 6.2542620555284945, + "grad_norm": 1.7118678092956543, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 6420 + }, + { + "epoch": 6.264003896736483, + "grad_norm": 1.4041547775268555, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 6430 + }, + { + "epoch": 6.273745737944472, + "grad_norm": 1.7653605937957764, + "learning_rate": 0.0002, + "loss": 0.879, + "step": 6440 + }, + { + "epoch": 6.28348757915246, + "grad_norm": 2.6219191551208496, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 6450 + }, + { + "epoch": 6.293229420360448, + "grad_norm": 1.4757837057113647, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 6460 + }, + { + "epoch": 6.302971261568437, + "grad_norm": 1.715598225593567, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 6470 + }, + { + "epoch": 6.312713102776424, + "grad_norm": 1.376216173171997, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 6480 + }, + { + "epoch": 6.322454943984413, + "grad_norm": 1.7119828462600708, + "learning_rate": 0.0002, + "loss": 0.8742, + "step": 6490 + }, + { + "epoch": 6.3321967851924015, + "grad_norm": 1.4304355382919312, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 6500 + }, + { + "epoch": 6.34193862640039, + "grad_norm": 1.4889872074127197, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 6510 + }, + { + "epoch": 6.351680467608378, + "grad_norm": 1.370373010635376, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 6520 + }, + { + "epoch": 6.361422308816366, + "grad_norm": 1.7697709798812866, + "learning_rate": 0.0002, + "loss": 0.8997, + "step": 6530 + }, + { + "epoch": 6.371164150024355, + "grad_norm": 1.495297908782959, + "learning_rate": 0.0002, + "loss": 0.9421, + "step": 6540 + }, + { + "epoch": 6.380905991232343, + "grad_norm": 1.7251347303390503, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 6550 + }, + { + "epoch": 6.390647832440331, + "grad_norm": 1.6909505128860474, + "learning_rate": 0.0002, + "loss": 0.9327, + "step": 6560 + }, + { + "epoch": 6.40038967364832, + "grad_norm": 1.4369314908981323, + "learning_rate": 0.0002, + "loss": 0.837, + "step": 6570 + }, + { + "epoch": 6.410131514856308, + "grad_norm": 1.7803739309310913, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 6580 + }, + { + "epoch": 6.419873356064296, + "grad_norm": 1.6107097864151, + "learning_rate": 0.0002, + "loss": 0.9024, + "step": 6590 + }, + { + "epoch": 6.429615197272285, + "grad_norm": 1.6151643991470337, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 6600 + }, + { + "epoch": 6.4393570384802725, + "grad_norm": 1.7159833908081055, + "learning_rate": 0.0002, + "loss": 0.8791, + "step": 6610 + }, + { + "epoch": 6.449098879688261, + "grad_norm": 1.4366064071655273, + "learning_rate": 0.0002, + "loss": 0.9249, + "step": 6620 + }, + { + "epoch": 6.45884072089625, + "grad_norm": 1.6050453186035156, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 6630 + }, + { + "epoch": 6.468582562104237, + "grad_norm": 1.6296740770339966, + "learning_rate": 0.0002, + "loss": 0.8943, + "step": 6640 + }, + { + "epoch": 6.478324403312226, + "grad_norm": 1.6181174516677856, + "learning_rate": 0.0002, + "loss": 0.9228, + "step": 6650 + }, + { + "epoch": 6.488066244520215, + "grad_norm": 1.5452176332473755, + "learning_rate": 0.0002, + "loss": 0.9139, + "step": 6660 + }, + { + "epoch": 6.497808085728202, + "grad_norm": 1.3919731378555298, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 6670 + }, + { + "epoch": 6.507549926936191, + "grad_norm": 1.6456257104873657, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 6680 + }, + { + "epoch": 6.5172917681441795, + "grad_norm": 1.4147369861602783, + "learning_rate": 0.0002, + "loss": 0.9041, + "step": 6690 + }, + { + "epoch": 6.527033609352167, + "grad_norm": 1.7005025148391724, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 6700 + }, + { + "epoch": 6.536775450560156, + "grad_norm": 1.6032357215881348, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 6710 + }, + { + "epoch": 6.5465172917681445, + "grad_norm": 1.3454229831695557, + "learning_rate": 0.0002, + "loss": 0.9796, + "step": 6720 + }, + { + "epoch": 6.556259132976132, + "grad_norm": 1.6961418390274048, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 6730 + }, + { + "epoch": 6.566000974184121, + "grad_norm": 1.78407883644104, + "learning_rate": 0.0002, + "loss": 0.9241, + "step": 6740 + }, + { + "epoch": 6.575742815392109, + "grad_norm": 1.6817889213562012, + "learning_rate": 0.0002, + "loss": 0.8941, + "step": 6750 + }, + { + "epoch": 6.585484656600097, + "grad_norm": 1.7894943952560425, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 6760 + }, + { + "epoch": 6.595226497808086, + "grad_norm": 1.6404837369918823, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 6770 + }, + { + "epoch": 6.604968339016074, + "grad_norm": 1.5849255323410034, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 6780 + }, + { + "epoch": 6.614710180224062, + "grad_norm": 1.5993813276290894, + "learning_rate": 0.0002, + "loss": 0.9575, + "step": 6790 + }, + { + "epoch": 6.624452021432051, + "grad_norm": 1.2834863662719727, + "learning_rate": 0.0002, + "loss": 0.8922, + "step": 6800 + }, + { + "epoch": 6.634193862640039, + "grad_norm": 1.7215641736984253, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 6810 + }, + { + "epoch": 6.643935703848027, + "grad_norm": 1.7588146924972534, + "learning_rate": 0.0002, + "loss": 0.9292, + "step": 6820 + }, + { + "epoch": 6.6536775450560155, + "grad_norm": 1.7956023216247559, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 6830 + }, + { + "epoch": 6.663419386264004, + "grad_norm": 1.5115351676940918, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 6840 + }, + { + "epoch": 6.673161227471992, + "grad_norm": 1.5660319328308105, + "learning_rate": 0.0002, + "loss": 0.9329, + "step": 6850 + }, + { + "epoch": 6.68290306867998, + "grad_norm": 1.4323679208755493, + "learning_rate": 0.0002, + "loss": 0.9877, + "step": 6860 + }, + { + "epoch": 6.692644909887969, + "grad_norm": 1.662089467048645, + "learning_rate": 0.0002, + "loss": 0.8732, + "step": 6870 + }, + { + "epoch": 6.702386751095958, + "grad_norm": 1.7854869365692139, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6880 + }, + { + "epoch": 6.712128592303945, + "grad_norm": 1.5491222143173218, + "learning_rate": 0.0002, + "loss": 0.9105, + "step": 6890 + }, + { + "epoch": 6.721870433511934, + "grad_norm": 1.5946987867355347, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 6900 + }, + { + "epoch": 6.731612274719922, + "grad_norm": 1.6195964813232422, + "learning_rate": 0.0002, + "loss": 0.9391, + "step": 6910 + }, + { + "epoch": 6.74135411592791, + "grad_norm": 1.6366901397705078, + "learning_rate": 0.0002, + "loss": 0.8947, + "step": 6920 + }, + { + "epoch": 6.751095957135899, + "grad_norm": 1.5080382823944092, + "learning_rate": 0.0002, + "loss": 0.8695, + "step": 6930 + }, + { + "epoch": 6.760837798343887, + "grad_norm": 1.742353916168213, + "learning_rate": 0.0002, + "loss": 0.9124, + "step": 6940 + }, + { + "epoch": 6.770579639551875, + "grad_norm": 1.690251111984253, + "learning_rate": 0.0002, + "loss": 0.9118, + "step": 6950 + }, + { + "epoch": 6.780321480759864, + "grad_norm": 1.7103357315063477, + "learning_rate": 0.0002, + "loss": 0.9039, + "step": 6960 + }, + { + "epoch": 6.7900633219678515, + "grad_norm": 1.6630914211273193, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 6970 + }, + { + "epoch": 6.79980516317584, + "grad_norm": 1.423768162727356, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 6980 + }, + { + "epoch": 6.809547004383829, + "grad_norm": 1.7844693660736084, + "learning_rate": 0.0002, + "loss": 0.9397, + "step": 6990 + }, + { + "epoch": 6.819288845591817, + "grad_norm": 1.545282006263733, + "learning_rate": 0.0002, + "loss": 0.8889, + "step": 7000 + }, + { + "epoch": 6.829030686799805, + "grad_norm": 1.4340319633483887, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 7010 + }, + { + "epoch": 6.838772528007794, + "grad_norm": 1.5981626510620117, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 7020 + }, + { + "epoch": 6.848514369215782, + "grad_norm": 1.5205026865005493, + "learning_rate": 0.0002, + "loss": 0.9062, + "step": 7030 + }, + { + "epoch": 6.85825621042377, + "grad_norm": 1.6999989748001099, + "learning_rate": 0.0002, + "loss": 0.9245, + "step": 7040 + }, + { + "epoch": 6.8679980516317585, + "grad_norm": 1.6392347812652588, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 7050 + }, + { + "epoch": 6.877739892839747, + "grad_norm": 1.637308955192566, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 7060 + }, + { + "epoch": 6.887481734047735, + "grad_norm": 1.671341896057129, + "learning_rate": 0.0002, + "loss": 0.9672, + "step": 7070 + }, + { + "epoch": 6.897223575255723, + "grad_norm": 1.4437555074691772, + "learning_rate": 0.0002, + "loss": 0.9726, + "step": 7080 + }, + { + "epoch": 6.906965416463712, + "grad_norm": 1.4251935482025146, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 7090 + }, + { + "epoch": 6.9167072576717, + "grad_norm": 1.5106734037399292, + "learning_rate": 0.0002, + "loss": 0.8858, + "step": 7100 + }, + { + "epoch": 6.926449098879688, + "grad_norm": 1.670742154121399, + "learning_rate": 0.0002, + "loss": 0.939, + "step": 7110 + }, + { + "epoch": 6.936190940087677, + "grad_norm": 1.4353723526000977, + "learning_rate": 0.0002, + "loss": 0.8818, + "step": 7120 + }, + { + "epoch": 6.945932781295665, + "grad_norm": 1.9437772035598755, + "learning_rate": 0.0002, + "loss": 0.9354, + "step": 7130 + }, + { + "epoch": 6.955674622503653, + "grad_norm": 1.4922038316726685, + "learning_rate": 0.0002, + "loss": 0.9623, + "step": 7140 + }, + { + "epoch": 6.965416463711642, + "grad_norm": 1.489193081855774, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 7150 + }, + { + "epoch": 6.9751583049196295, + "grad_norm": 1.529490351676941, + "learning_rate": 0.0002, + "loss": 1.0024, + "step": 7160 + }, + { + "epoch": 6.984900146127618, + "grad_norm": 1.7370105981826782, + "learning_rate": 0.0002, + "loss": 0.9715, + "step": 7170 + }, + { + "epoch": 6.994641987335607, + "grad_norm": 1.5639604330062866, + "learning_rate": 0.0002, + "loss": 0.921, + "step": 7180 + }, + { + "epoch": 6.9995129079396, + "eval_loss": 2.521758794784546, + "eval_runtime": 56.1587, + "eval_samples_per_second": 9.028, + "eval_steps_per_second": 1.14, + "step": 7185 + }, + { + "epoch": 7.004383828543594, + "grad_norm": 1.391621470451355, + "learning_rate": 0.0002, + "loss": 0.8682, + "step": 7190 + }, + { + "epoch": 7.014125669751583, + "grad_norm": 2.3696491718292236, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 7200 + }, + { + "epoch": 7.023867510959572, + "grad_norm": 1.6873828172683716, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 7210 + }, + { + "epoch": 7.033609352167559, + "grad_norm": 1.8893300294876099, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 7220 + }, + { + "epoch": 7.043351193375548, + "grad_norm": 1.6323082447052002, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 7230 + }, + { + "epoch": 7.0530930345835365, + "grad_norm": 1.9979127645492554, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 7240 + }, + { + "epoch": 7.062834875791524, + "grad_norm": 2.0339183807373047, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 7250 + }, + { + "epoch": 7.072576716999513, + "grad_norm": 1.6820781230926514, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 7260 + }, + { + "epoch": 7.0823185582075014, + "grad_norm": 2.0400710105895996, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 7270 + }, + { + "epoch": 7.092060399415489, + "grad_norm": 2.13495135307312, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 7280 + }, + { + "epoch": 7.101802240623478, + "grad_norm": 1.6993554830551147, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 7290 + }, + { + "epoch": 7.111544081831466, + "grad_norm": 1.9262464046478271, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 7300 + }, + { + "epoch": 7.121285923039454, + "grad_norm": 1.8407244682312012, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 7310 + }, + { + "epoch": 7.131027764247443, + "grad_norm": 1.744294285774231, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 7320 + }, + { + "epoch": 7.140769605455431, + "grad_norm": 1.7602320909500122, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 7330 + }, + { + "epoch": 7.150511446663419, + "grad_norm": 1.7360851764678955, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 7340 + }, + { + "epoch": 7.160253287871408, + "grad_norm": 2.0012850761413574, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 7350 + }, + { + "epoch": 7.169995129079396, + "grad_norm": 2.064319372177124, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 7360 + }, + { + "epoch": 7.179736970287384, + "grad_norm": 1.4556169509887695, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 7370 + }, + { + "epoch": 7.1894788114953725, + "grad_norm": 2.365649938583374, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 7380 + }, + { + "epoch": 7.199220652703361, + "grad_norm": 1.8271889686584473, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 7390 + }, + { + "epoch": 7.208962493911349, + "grad_norm": 1.9143747091293335, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 7400 + }, + { + "epoch": 7.218704335119337, + "grad_norm": 1.5670185089111328, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 7410 + }, + { + "epoch": 7.228446176327326, + "grad_norm": 1.7452768087387085, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 7420 + }, + { + "epoch": 7.238188017535315, + "grad_norm": 1.7830921411514282, + "learning_rate": 0.0002, + "loss": 0.6848, + "step": 7430 + }, + { + "epoch": 7.247929858743302, + "grad_norm": 1.9281501770019531, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 7440 + }, + { + "epoch": 7.257671699951291, + "grad_norm": 1.889663815498352, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 7450 + }, + { + "epoch": 7.2674135411592795, + "grad_norm": 1.704999566078186, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 7460 + }, + { + "epoch": 7.277155382367267, + "grad_norm": 1.824109435081482, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 7470 + }, + { + "epoch": 7.286897223575256, + "grad_norm": 1.5378915071487427, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 7480 + }, + { + "epoch": 7.296639064783244, + "grad_norm": 1.830587387084961, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 7490 + }, + { + "epoch": 7.306380905991232, + "grad_norm": 2.0029330253601074, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 7500 + }, + { + "epoch": 7.316122747199221, + "grad_norm": 2.0871448516845703, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 7510 + }, + { + "epoch": 7.325864588407209, + "grad_norm": 1.8416074514389038, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 7520 + }, + { + "epoch": 7.335606429615197, + "grad_norm": 1.8962771892547607, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 7530 + }, + { + "epoch": 7.345348270823186, + "grad_norm": 1.899487018585205, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 7540 + }, + { + "epoch": 7.355090112031174, + "grad_norm": 1.8300765752792358, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 7550 + }, + { + "epoch": 7.364831953239162, + "grad_norm": 2.178112268447876, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 7560 + }, + { + "epoch": 7.3745737944471506, + "grad_norm": 1.8472180366516113, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 7570 + }, + { + "epoch": 7.384315635655139, + "grad_norm": 1.7787587642669678, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 7580 + }, + { + "epoch": 7.394057476863127, + "grad_norm": 1.8309564590454102, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 7590 + }, + { + "epoch": 7.4037993180711155, + "grad_norm": 2.028923273086548, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 7600 + }, + { + "epoch": 7.413541159279104, + "grad_norm": 1.7393525838851929, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 7610 + }, + { + "epoch": 7.423283000487092, + "grad_norm": 1.8816498517990112, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 7620 + }, + { + "epoch": 7.43302484169508, + "grad_norm": 2.4553585052490234, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 7630 + }, + { + "epoch": 7.442766682903069, + "grad_norm": 1.9045933485031128, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 7640 + }, + { + "epoch": 7.452508524111057, + "grad_norm": 1.664156198501587, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 7650 + }, + { + "epoch": 7.462250365319045, + "grad_norm": 1.792748212814331, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7660 + }, + { + "epoch": 7.471992206527034, + "grad_norm": 1.8481247425079346, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 7670 + }, + { + "epoch": 7.481734047735022, + "grad_norm": 2.0541393756866455, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 7680 + }, + { + "epoch": 7.49147588894301, + "grad_norm": 1.594969630241394, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 7690 + }, + { + "epoch": 7.501217730150999, + "grad_norm": 2.1409924030303955, + "learning_rate": 0.0002, + "loss": 0.7042, + "step": 7700 + }, + { + "epoch": 7.5109595713589865, + "grad_norm": 1.9743319749832153, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 7710 + }, + { + "epoch": 7.520701412566975, + "grad_norm": 1.866410493850708, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 7720 + }, + { + "epoch": 7.530443253774964, + "grad_norm": 1.9087774753570557, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 7730 + }, + { + "epoch": 7.540185094982951, + "grad_norm": 1.8624005317687988, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 7740 + }, + { + "epoch": 7.54992693619094, + "grad_norm": 1.629889726638794, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 7750 + }, + { + "epoch": 7.559668777398929, + "grad_norm": 2.1364638805389404, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 7760 + }, + { + "epoch": 7.569410618606916, + "grad_norm": 1.591701865196228, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 7770 + }, + { + "epoch": 7.579152459814905, + "grad_norm": 2.3200602531433105, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 7780 + }, + { + "epoch": 7.5888943010228935, + "grad_norm": 1.9998793601989746, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 7790 + }, + { + "epoch": 7.598636142230882, + "grad_norm": 1.8921900987625122, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 7800 + }, + { + "epoch": 7.60837798343887, + "grad_norm": 1.8826839923858643, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 7810 + }, + { + "epoch": 7.618119824646858, + "grad_norm": 1.8796452283859253, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 7820 + }, + { + "epoch": 7.627861665854846, + "grad_norm": 1.6528139114379883, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 7830 + }, + { + "epoch": 7.637603507062835, + "grad_norm": 1.9646536111831665, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 7840 + }, + { + "epoch": 7.647345348270823, + "grad_norm": 1.6951191425323486, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 7850 + }, + { + "epoch": 7.657087189478812, + "grad_norm": 1.8734302520751953, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 7860 + }, + { + "epoch": 7.6668290306868, + "grad_norm": 2.140984058380127, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 7870 + }, + { + "epoch": 7.676570871894788, + "grad_norm": 1.8852670192718506, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 7880 + }, + { + "epoch": 7.686312713102776, + "grad_norm": 2.1172003746032715, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 7890 + }, + { + "epoch": 7.696054554310765, + "grad_norm": 1.8237593173980713, + "learning_rate": 0.0002, + "loss": 0.7796, + "step": 7900 + }, + { + "epoch": 7.705796395518753, + "grad_norm": 2.1399245262145996, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 7910 + }, + { + "epoch": 7.715538236726742, + "grad_norm": 1.8119547367095947, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 7920 + }, + { + "epoch": 7.7252800779347295, + "grad_norm": 1.943442463874817, + "learning_rate": 0.0002, + "loss": 0.7826, + "step": 7930 + }, + { + "epoch": 7.735021919142718, + "grad_norm": 1.6926734447479248, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 7940 + }, + { + "epoch": 7.744763760350706, + "grad_norm": 1.6824363470077515, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 7950 + }, + { + "epoch": 7.754505601558694, + "grad_norm": 1.8615055084228516, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 7960 + }, + { + "epoch": 7.764247442766683, + "grad_norm": 1.7171595096588135, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 7970 + }, + { + "epoch": 7.773989283974672, + "grad_norm": 1.9871152639389038, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 7980 + }, + { + "epoch": 7.783731125182659, + "grad_norm": 1.8975892066955566, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 7990 + }, + { + "epoch": 7.793472966390648, + "grad_norm": 1.8259385824203491, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 8000 + }, + { + "epoch": 7.8032148075986365, + "grad_norm": 2.2361183166503906, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 8010 + }, + { + "epoch": 7.812956648806624, + "grad_norm": 1.64067804813385, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 8020 + }, + { + "epoch": 7.822698490014613, + "grad_norm": 2.0037248134613037, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 8030 + }, + { + "epoch": 7.832440331222601, + "grad_norm": 1.8022961616516113, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 8040 + }, + { + "epoch": 7.842182172430589, + "grad_norm": 1.9980754852294922, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 8050 + }, + { + "epoch": 7.851924013638578, + "grad_norm": 1.632716178894043, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 8060 + }, + { + "epoch": 7.861665854846566, + "grad_norm": 1.6348111629486084, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 8070 + }, + { + "epoch": 7.871407696054554, + "grad_norm": 1.968295693397522, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 8080 + }, + { + "epoch": 7.881149537262543, + "grad_norm": 1.6947685480117798, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 8090 + }, + { + "epoch": 7.890891378470531, + "grad_norm": 6.1600341796875, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 8100 + }, + { + "epoch": 7.900633219678519, + "grad_norm": 1.9334033727645874, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 8110 + }, + { + "epoch": 7.9103750608865075, + "grad_norm": 1.729058027267456, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 8120 + }, + { + "epoch": 7.920116902094496, + "grad_norm": 1.8671422004699707, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 8130 + }, + { + "epoch": 7.929858743302484, + "grad_norm": 1.9794875383377075, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 8140 + }, + { + "epoch": 7.9396005845104725, + "grad_norm": 1.812229037284851, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 8150 + }, + { + "epoch": 7.949342425718461, + "grad_norm": 1.7354048490524292, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 8160 + }, + { + "epoch": 7.959084266926449, + "grad_norm": 1.7386713027954102, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 8170 + }, + { + "epoch": 7.968826108134437, + "grad_norm": 1.917111873626709, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 8180 + }, + { + "epoch": 7.978567949342426, + "grad_norm": 1.7007793188095093, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 8190 + }, + { + "epoch": 7.988309790550414, + "grad_norm": 1.8241386413574219, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 8200 + }, + { + "epoch": 7.996103263516805, + "eval_loss": 2.877988576889038, + "eval_runtime": 57.2196, + "eval_samples_per_second": 8.861, + "eval_steps_per_second": 1.118, + "step": 8208 + } + ], + "logging_steps": 10, + "max_steps": 8208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.215936241159373e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-8208/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f9dd281c79a14a5c823e817e96ff2fd5ff07e747 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e8ef519de92b2db62801feef14bb344bb8aa4b619a17c82a1cf2d14a9cfb5a +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/training_log.jsonl b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df706e3971e5a2da92eb2bc4eb21dc663e73817a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/training_log.jsonl @@ -0,0 +1,14 @@ +{"epoch": 0.9995129079396006, "step": 1026, "epoch_duration": 1643.7049367427826, "total_accumulated_duration": 1643.7049367427826, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0867, "grad_norm": 0.9572330713272095, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.5036, "grad_norm": 0.48326826095581055, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2889, "grad_norm": 0.518182098865509, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0426, "grad_norm": 0.7774201035499573, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9571, "grad_norm": 0.48625442385673523, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.9898, "grad_norm": 0.73747318983078, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.985, "grad_norm": 1.0816543102264404, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9356, "grad_norm": 0.5395983457565308, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9234, "grad_norm": 0.4758393466472626, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8904, "grad_norm": 0.391179621219635, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9042, "grad_norm": 0.41491082310676575, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8507, "grad_norm": 0.3800058364868164, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.8988, "grad_norm": 0.3326554596424103, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.8453, "grad_norm": 0.3734934329986572, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7938, "grad_norm": 0.97270667552948, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8532, "grad_norm": 0.3579750657081604, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8489, "grad_norm": 0.474427193403244, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.8452, "grad_norm": 0.4048471748828888, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8921, "grad_norm": 0.3494202792644501, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7686, "grad_norm": 0.3681524991989136, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8426, "grad_norm": 0.4028656482696533, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8826, "grad_norm": 0.3967253267765045, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7559, "grad_norm": 0.3345787823200226, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.8261, "grad_norm": 0.3374127149581909, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8527, "grad_norm": 0.31277430057525635, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7443, "grad_norm": 0.35254907608032227, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8618, "grad_norm": 0.34836333990097046, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8531, "grad_norm": 0.37661314010620117, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8427, "grad_norm": 0.3196435570716858, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.8193, "grad_norm": 0.33571064472198486, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7663, "grad_norm": 0.3384968638420105, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8286, "grad_norm": 0.3517991602420807, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8181, "grad_norm": 0.32833170890808105, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8225, "grad_norm": 0.335313618183136, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9181, "grad_norm": 0.35762909054756165, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7967, "grad_norm": 0.3138239085674286, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8608, "grad_norm": 0.36576879024505615, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8007, "grad_norm": 0.3099065124988556, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9198, "grad_norm": 0.30725058913230896, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7981, "grad_norm": 0.32897332310676575, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8976, "grad_norm": 0.3343771994113922, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8105, "grad_norm": 0.4490242302417755, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8567, "grad_norm": 0.33699190616607666, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8487, "grad_norm": 0.3511468768119812, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8524, "grad_norm": 0.313871830701828, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8469, "grad_norm": 0.4389854967594147, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7898, "grad_norm": 0.34989842772483826, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7562, "grad_norm": 0.4221613109111786, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7859, "grad_norm": 0.44692835211753845, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8261, "grad_norm": 0.3849453330039978, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8033, "grad_norm": 0.3177783191204071, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6953, "grad_norm": 0.31056079268455505, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.827, "grad_norm": 0.3379001021385193, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7737, "grad_norm": 0.3038215637207031, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.785, "grad_norm": 0.298612117767334, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.823, "grad_norm": 0.3020114004611969, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.836, "grad_norm": 0.3560065031051636, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8287, "grad_norm": 0.3355957865715027, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7066, "grad_norm": 0.44496026635169983, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.7669, "grad_norm": 0.33643242716789246, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.836, "grad_norm": 0.3368454873561859, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7292, "grad_norm": 0.34035852551460266, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8323, "grad_norm": 0.35322463512420654, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8072, "grad_norm": 0.3246999979019165, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7479, "grad_norm": 0.32876884937286377, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8684, "grad_norm": 0.28478550910949707, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8436, "grad_norm": 0.2967577278614044, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.8472, "grad_norm": 0.30370232462882996, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7885, "grad_norm": 0.32845404744148254, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8463, "grad_norm": 0.3011477589607239, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7829, "grad_norm": 0.2819054126739502, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8117, "grad_norm": 0.2947770059108734, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8057, "grad_norm": 0.3109278976917267, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7735, "grad_norm": 0.2782546877861023, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7394, "grad_norm": 0.3449978232383728, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7618, "grad_norm": 0.2844734191894531, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8394, "grad_norm": 0.32151708006858826, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8188, "grad_norm": 0.3154081106185913, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8075, "grad_norm": 0.27891838550567627, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.31346985697746277, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7947, "grad_norm": 0.2725819945335388, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7644, "grad_norm": 0.33911818265914917, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7545, "grad_norm": 0.28316545486450195, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7547, "grad_norm": 0.3096636235713959, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.7917, "grad_norm": 0.3493746817111969, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7496, "grad_norm": 0.3246673345565796, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7177, "grad_norm": 0.31515568494796753, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7839, "grad_norm": 0.2828545570373535, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7106, "grad_norm": 0.31354808807373047, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7714, "grad_norm": 0.3020916283130646, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7234, "grad_norm": 0.32516390085220337, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7829, "grad_norm": 0.3393784165382385, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8264, "grad_norm": 0.36150357127189636, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8358, "grad_norm": 0.31706321239471436, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.8244, "grad_norm": 0.33156445622444153, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8231, "grad_norm": 0.3012140989303589, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7741, "grad_norm": 0.35591211915016174, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8081, "grad_norm": 0.3784671425819397, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8921, "grad_norm": 0.3150336444377899, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8511, "grad_norm": 0.31587284803390503, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.8018, "grad_norm": 0.3279566466808319, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7967, "grad_norm": 0.29044675827026367, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}]} +{"epoch": 0.9995129079396006, "step": 1026, "epoch_duration": 8614.675071716309, "total_accumulated_duration": 8614.675071716309, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0849, "grad_norm": 0.7493793964385986, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4869, "grad_norm": 0.4989018440246582, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2871, "grad_norm": 0.5077806711196899, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0476, "grad_norm": 0.6642375588417053, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9592, "grad_norm": 0.45972347259521484, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.9894, "grad_norm": 0.4688694179058075, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9818, "grad_norm": 0.5043684840202332, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9287, "grad_norm": 0.4621885418891907, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9208, "grad_norm": 0.42979180812835693, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8931, "grad_norm": 0.4064626395702362, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9035, "grad_norm": 0.40199074149131775, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8504, "grad_norm": 0.44314295053482056, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3227830231189728, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.8483, "grad_norm": 0.4403241276741028, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7946, "grad_norm": 0.35792240500450134, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8478, "grad_norm": 0.3626907765865326, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8466, "grad_norm": 0.43099069595336914, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.8434, "grad_norm": 0.3468930423259735, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8898, "grad_norm": 0.36435553431510925, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7698, "grad_norm": 0.45654407143592834, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8448, "grad_norm": 0.3559560179710388, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8808, "grad_norm": 0.42246749997138977, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7555, "grad_norm": 0.35775721073150635, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.8263, "grad_norm": 0.3552558422088623, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8525, "grad_norm": 0.33366483449935913, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7423, "grad_norm": 0.3529402017593384, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8591, "grad_norm": 0.35132497549057007, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8538, "grad_norm": 0.35935965180397034, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8453, "grad_norm": 0.32442131638526917, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.3304824233055115, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7609, "grad_norm": 0.3343234956264496, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8234, "grad_norm": 0.42407166957855225, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8189, "grad_norm": 0.3250719904899597, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8235, "grad_norm": 0.3359096348285675, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9144, "grad_norm": 0.36697816848754883, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7934, "grad_norm": 0.3180158734321594, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8631, "grad_norm": 0.3829543888568878, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8041, "grad_norm": 0.3221031725406647, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9187, "grad_norm": 0.31730103492736816, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7975, "grad_norm": 0.3535255789756775, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8966, "grad_norm": 0.35524800419807434, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8135, "grad_norm": 0.442055344581604, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8574, "grad_norm": 0.3454146087169647, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8531, "grad_norm": 0.33116260170936584, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8503, "grad_norm": 0.33906036615371704, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8453, "grad_norm": 0.35210177302360535, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7888, "grad_norm": 0.3600303530693054, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3073132634162903, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7825, "grad_norm": 0.4066285490989685, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8186, "grad_norm": 0.7604546546936035, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8051, "grad_norm": 0.30416470766067505, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6934, "grad_norm": 0.31779664754867554, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8255, "grad_norm": 0.32126376032829285, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7717, "grad_norm": 0.30189672112464905, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7785, "grad_norm": 0.28167518973350525, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8229, "grad_norm": 0.3048870265483856, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8348, "grad_norm": 0.3255627453327179, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8314, "grad_norm": 0.3127131462097168, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7036, "grad_norm": 0.3731038570404053, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.7655, "grad_norm": 0.34647998213768005, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8374, "grad_norm": 0.3332764208316803, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7265, "grad_norm": 0.3403870463371277, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8279, "grad_norm": 0.3102012574672699, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8047, "grad_norm": 0.3112141489982605, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7442, "grad_norm": 0.35278451442718506, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8633, "grad_norm": 0.27916669845581055, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.841, "grad_norm": 0.29995933175086975, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.8432, "grad_norm": 0.30316805839538574, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7848, "grad_norm": 0.31463387608528137, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8466, "grad_norm": 0.2990514934062958, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7844, "grad_norm": 0.2812942564487457, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8101, "grad_norm": 0.3082747459411621, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8038, "grad_norm": 0.29814761877059937, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7691, "grad_norm": 0.2816784381866455, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.739, "grad_norm": 0.3230450749397278, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7612, "grad_norm": 0.28781887888908386, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.837, "grad_norm": 0.2953524887561798, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8213, "grad_norm": 0.3054346442222595, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.808, "grad_norm": 0.2908236086368561, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.31755733489990234, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7927, "grad_norm": 0.26710301637649536, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7626, "grad_norm": 0.3451886773109436, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7533, "grad_norm": 0.2959114611148834, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7551, "grad_norm": 0.31711632013320923, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.7905, "grad_norm": 0.34547725319862366, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7484, "grad_norm": 0.32684141397476196, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7154, "grad_norm": 0.3261931538581848, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7823, "grad_norm": 0.28676143288612366, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7099, "grad_norm": 0.3189515173435211, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.772, "grad_norm": 0.30141714215278625, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7233, "grad_norm": 0.336100310087204, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.781, "grad_norm": 0.3328329026699066, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8258, "grad_norm": 0.31725209951400757, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8332, "grad_norm": 0.3101555407047272, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.8226, "grad_norm": 0.3060206174850464, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8227, "grad_norm": 0.289568156003952, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7728, "grad_norm": 0.3252294957637787, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8063, "grad_norm": 0.3643883168697357, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8899, "grad_norm": 0.313032329082489, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8478, "grad_norm": 0.299296498298645, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.8003, "grad_norm": 0.32191869616508484, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7958, "grad_norm": 0.2850085198879242, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}]} +{"epoch": 0.9995129079396006, "step": 1026, "epoch_duration": 9131.119746685028, "total_accumulated_duration": 9131.119746685028, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0805, "grad_norm": 1.4004040956497192, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.477, "grad_norm": 0.49528834223747253, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2657, "grad_norm": 0.5620803833007812, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0145, "grad_norm": 0.5206792950630188, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9441, "grad_norm": 0.5192418098449707, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.9848, "grad_norm": 0.4760245680809021, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9792, "grad_norm": 0.49701640009880066, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9248, "grad_norm": 0.426954060792923, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9176, "grad_norm": 0.41753074526786804, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8882, "grad_norm": 0.43176209926605225, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9034, "grad_norm": 0.4172838032245636, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8503, "grad_norm": 0.37853118777275085, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.8993, "grad_norm": 0.3643597662448883, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.8498, "grad_norm": 0.3552045226097107, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.793, "grad_norm": 0.3611885905265808, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8489, "grad_norm": 0.35341012477874756, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8461, "grad_norm": 0.411731481552124, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.8408, "grad_norm": 0.3752767741680145, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8867, "grad_norm": 0.3459947109222412, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7685, "grad_norm": 0.3611522316932678, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8399, "grad_norm": 0.3513851463794708, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8785, "grad_norm": 0.37236350774765015, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7563, "grad_norm": 0.34816354513168335, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.8209, "grad_norm": 0.3423711955547333, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8506, "grad_norm": 0.3034679889678955, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7423, "grad_norm": 0.3754836320877075, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8595, "grad_norm": 0.3404528498649597, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8526, "grad_norm": 0.3615986108779907, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8456, "grad_norm": 0.3140770494937897, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.8208, "grad_norm": 0.32112881541252136, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7583, "grad_norm": 0.34159770607948303, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8255, "grad_norm": 0.9457993507385254, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8174, "grad_norm": 0.3212255537509918, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8235, "grad_norm": 0.34061118960380554, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9153, "grad_norm": 0.3667244017124176, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7942, "grad_norm": 0.3188079595565796, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8638, "grad_norm": 0.35733574628829956, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.7988, "grad_norm": 0.30891889333724976, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.921, "grad_norm": 0.29940274357795715, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7979, "grad_norm": 0.31464695930480957, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8985, "grad_norm": 0.3302725851535797, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8103, "grad_norm": 0.39097800850868225, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8554, "grad_norm": 0.3535297214984894, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8539, "grad_norm": 0.3308466672897339, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8517, "grad_norm": 0.3058496415615082, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8491, "grad_norm": 0.7465469241142273, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7907, "grad_norm": 0.3629920482635498, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7578, "grad_norm": 0.324266254901886, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7821, "grad_norm": 0.3932960331439972, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8214, "grad_norm": 0.3609100580215454, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8025, "grad_norm": 0.29935353994369507, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6927, "grad_norm": 0.3104528784751892, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8232, "grad_norm": 0.32785764336586, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.772, "grad_norm": 0.3005681037902832, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7782, "grad_norm": 0.2888568341732025, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8191, "grad_norm": 0.32374313473701477, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8334, "grad_norm": 0.3289007842540741, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.831, "grad_norm": 0.3174697160720825, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7025, "grad_norm": 0.34157034754753113, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.7634, "grad_norm": 0.33474043011665344, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8317, "grad_norm": 0.3204871118068695, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7249, "grad_norm": 0.3411032259464264, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.832, "grad_norm": 0.33133742213249207, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8068, "grad_norm": 0.32993486523628235, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7421, "grad_norm": 0.3632366359233856, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8629, "grad_norm": 0.2831084132194519, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8406, "grad_norm": 0.3102332353591919, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.8444, "grad_norm": 0.2975476086139679, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7871, "grad_norm": 0.32190340757369995, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.544466495513916, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7835, "grad_norm": 0.2760232388973236, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8107, "grad_norm": 0.29771652817726135, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8015, "grad_norm": 0.30051174759864807, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7713, "grad_norm": 0.2826128900051117, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7416, "grad_norm": 0.335135281085968, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.763, "grad_norm": 0.28276902437210083, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8398, "grad_norm": 0.29236480593681335, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8163, "grad_norm": 0.28522199392318726, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8069, "grad_norm": 0.28323712944984436, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7938, "grad_norm": 0.30547669529914856, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7934, "grad_norm": 0.27654391527175903, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7623, "grad_norm": 0.3366940915584564, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.753, "grad_norm": 0.2769072353839874, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7555, "grad_norm": 0.32644131779670715, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.7931, "grad_norm": 0.32108405232429504, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7489, "grad_norm": 0.3608355224132538, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.3408980667591095, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.782, "grad_norm": 0.2861264944076538, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7092, "grad_norm": 0.3119470477104187, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7732, "grad_norm": 0.35495609045028687, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.724, "grad_norm": 0.3283773362636566, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7782, "grad_norm": 0.34948012232780457, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8235, "grad_norm": 0.31886062026023865, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8344, "grad_norm": 0.31308072805404663, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.8224, "grad_norm": 0.3056151866912842, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8239, "grad_norm": 0.28585341572761536, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7766, "grad_norm": 0.31313595175743103, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8047, "grad_norm": 0.3548518121242523, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8885, "grad_norm": 0.3156449496746063, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8459, "grad_norm": 0.30134329199790955, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7983, "grad_norm": 0.3213663101196289, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7925, "grad_norm": 0.2871834337711334, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}]} +{"epoch": 0.9995129079396006, "step": 1026, "epoch_duration": 7805.627238512039, "total_accumulated_duration": 7805.627238512039, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0792, "grad_norm": 0.7775738835334778, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4866, "grad_norm": 0.5583823323249817, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.265, "grad_norm": 0.5392235517501831, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0227, "grad_norm": 0.5747509598731995, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.947, "grad_norm": 0.43950968980789185, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.9836, "grad_norm": 0.46261805295944214, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9802, "grad_norm": 0.4872327446937561, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9254, "grad_norm": 0.4279601573944092, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9145, "grad_norm": 0.40902411937713623, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8922, "grad_norm": 0.3914395868778229, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9064, "grad_norm": 0.4119057059288025, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8464, "grad_norm": 0.36096957325935364, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.8976, "grad_norm": 0.3246411681175232, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.8445, "grad_norm": 0.3493919372558594, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7908, "grad_norm": 0.38220298290252686, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.848, "grad_norm": 0.35742080211639404, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.846, "grad_norm": 0.8206163048744202, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.8433, "grad_norm": 0.41155320405960083, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8887, "grad_norm": 0.36376968026161194, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7691, "grad_norm": 0.3520759046077728, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8431, "grad_norm": 0.3779120147228241, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8806, "grad_norm": 0.36827215552330017, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7555, "grad_norm": 0.35667774081230164, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.8234, "grad_norm": 0.3537030518054962, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8506, "grad_norm": 0.3095904290676117, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.74, "grad_norm": 0.3664686679840088, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8595, "grad_norm": 0.3403360843658447, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8514, "grad_norm": 0.35468989610671997, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.846, "grad_norm": 0.33379730582237244, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.8197, "grad_norm": 0.3371599614620209, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7603, "grad_norm": 0.345524400472641, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8263, "grad_norm": 0.33196690678596497, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8161, "grad_norm": 0.33003339171409607, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8261, "grad_norm": 0.34377405047416687, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9168, "grad_norm": 0.38273748755455017, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7992, "grad_norm": 0.31873393058776855, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8673, "grad_norm": 0.4106711149215698, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.802, "grad_norm": 0.3180122673511505, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9229, "grad_norm": 0.30043068528175354, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7983, "grad_norm": 0.3154022991657257, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.9015, "grad_norm": 0.3352239727973938, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8102, "grad_norm": 0.44572168588638306, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8588, "grad_norm": 0.3307209312915802, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8497, "grad_norm": 0.3264922499656677, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8515, "grad_norm": 0.3088473081588745, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8498, "grad_norm": 0.9683569073677063, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7902, "grad_norm": 0.38158977031707764, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7569, "grad_norm": 0.3046931326389313, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7835, "grad_norm": 0.3930199444293976, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8243, "grad_norm": 0.4819939136505127, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8042, "grad_norm": 0.30577245354652405, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6942, "grad_norm": 0.3192637264728546, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8252, "grad_norm": 0.3385009467601776, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7762, "grad_norm": 0.3092128336429596, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7805, "grad_norm": 0.29258453845977783, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8223, "grad_norm": 0.30027705430984497, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8365, "grad_norm": 0.3087688982486725, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8318, "grad_norm": 0.6585562229156494, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7023, "grad_norm": 0.34260162711143494, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.7602, "grad_norm": 0.3322044909000397, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.835, "grad_norm": 0.3490358591079712, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7277, "grad_norm": 0.33683666586875916, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8276, "grad_norm": 0.33935579657554626, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8063, "grad_norm": 0.3583163917064667, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7456, "grad_norm": 0.3733830153942108, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8643, "grad_norm": 0.2808888256549835, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8403, "grad_norm": 0.29853978753089905, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.8423, "grad_norm": 0.2974698841571808, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7887, "grad_norm": 0.31382060050964355, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8474, "grad_norm": 0.3384411334991455, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7824, "grad_norm": 0.279596209526062, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8112, "grad_norm": 0.2944697141647339, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8058, "grad_norm": 0.3064538538455963, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7707, "grad_norm": 0.28189340233802795, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3372725546360016, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7594, "grad_norm": 0.2781224250793457, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.836, "grad_norm": 0.2901884913444519, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8183, "grad_norm": 0.2983466386795044, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8083, "grad_norm": 0.28513798117637634, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.795, "grad_norm": 0.32360851764678955, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7908, "grad_norm": 0.274665892124176, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7632, "grad_norm": 0.3458949029445648, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7529, "grad_norm": 0.2805227041244507, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31838613748550415, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.7912, "grad_norm": 0.3196384012699127, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7497, "grad_norm": 0.3167778253555298, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7159, "grad_norm": 0.3295934498310089, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7826, "grad_norm": 0.2816080152988434, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7086, "grad_norm": 0.3783347010612488, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.773, "grad_norm": 0.31769317388534546, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7215, "grad_norm": 0.3217010796070099, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7808, "grad_norm": 0.44784948229789734, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8243, "grad_norm": 0.311459481716156, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.835, "grad_norm": 0.31605151295661926, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.8204, "grad_norm": 0.2969941794872284, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8237, "grad_norm": 0.29052597284317017, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7735, "grad_norm": 0.5308746099472046, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8038, "grad_norm": 0.3567570745944977, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8888, "grad_norm": 0.3252796232700348, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8484, "grad_norm": 0.4144253432750702, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.798, "grad_norm": 0.3273269236087799, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7948, "grad_norm": 0.2791995108127594, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}]} +{"epoch": 0.9995129079396006, "step": 1026, "epoch_duration": 5217.032011985779, "total_accumulated_duration": 5217.032011985779, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0768, "grad_norm": 0.7021015882492065, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4631, "grad_norm": 0.5293550491333008, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2465, "grad_norm": 0.7597886323928833, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0107, "grad_norm": 0.5716570019721985, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9413, "grad_norm": 0.42367565631866455, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.9828, "grad_norm": 0.4324796199798584, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.981, "grad_norm": 0.5050645470619202, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9234, "grad_norm": 0.42312121391296387, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9152, "grad_norm": 0.41372519731521606, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8904, "grad_norm": 0.3871922194957733, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9072, "grad_norm": 0.40193668007850647, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8457, "grad_norm": 0.3660767078399658, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.8967, "grad_norm": 0.31849604845046997, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.8465, "grad_norm": 0.34399211406707764, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7901, "grad_norm": 0.32782959938049316, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8499, "grad_norm": 0.33730486035346985, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8507, "grad_norm": 0.47183865308761597, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.8449, "grad_norm": 0.41224056482315063, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8872, "grad_norm": 0.3493252098560333, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7698, "grad_norm": 0.3443562686443329, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8442, "grad_norm": 0.3719293177127838, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8795, "grad_norm": 0.37245652079582214, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7547, "grad_norm": 0.3301680386066437, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.8252, "grad_norm": 0.36012348532676697, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8539, "grad_norm": 0.43787631392478943, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7411, "grad_norm": 0.40697792172431946, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8604, "grad_norm": 0.37754276394844055, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8508, "grad_norm": 0.37219709157943726, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8499, "grad_norm": 0.32134678959846497, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.8204, "grad_norm": 0.323728084564209, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7651, "grad_norm": 0.3529917895793915, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8289, "grad_norm": 0.3383786976337433, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8153, "grad_norm": 0.32772132754325867, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8242, "grad_norm": 0.337752103805542, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9168, "grad_norm": 0.35874703526496887, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.8002, "grad_norm": 0.32408469915390015, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8643, "grad_norm": 0.35104885697364807, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8018, "grad_norm": 0.30929499864578247, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9232, "grad_norm": 0.3031355142593384, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7957, "grad_norm": 0.3190555274486542, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8983, "grad_norm": 0.44238677620887756, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8108, "grad_norm": 0.3833184838294983, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8567, "grad_norm": 0.3284796476364136, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8518, "grad_norm": 0.337785929441452, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8543, "grad_norm": 0.3230440020561218, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8484, "grad_norm": 0.4421052634716034, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7897, "grad_norm": 0.35508641600608826, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7569, "grad_norm": 0.2951701879501343, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7841, "grad_norm": 0.4701220393180847, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8226, "grad_norm": 0.3613520860671997, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8047, "grad_norm": 0.302544504404068, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6907, "grad_norm": 0.3328293561935425, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8228, "grad_norm": 0.3272629976272583, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7731, "grad_norm": 0.31455153226852417, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.78, "grad_norm": 0.2919689118862152, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.30982503294944763, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8361, "grad_norm": 0.31483978033065796, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8301, "grad_norm": 0.33073070645332336, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7029, "grad_norm": 0.34696322679519653, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.7615, "grad_norm": 0.3365851640701294, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8371, "grad_norm": 0.3475584089756012, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7241, "grad_norm": 0.711076021194458, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.829, "grad_norm": 0.31679967045783997, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8055, "grad_norm": 0.3273877203464508, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7441, "grad_norm": 0.5061460137367249, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8649, "grad_norm": 0.2813282907009125, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8429, "grad_norm": 0.30541664361953735, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.8458, "grad_norm": 0.3182958662509918, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7871, "grad_norm": 0.31644582748413086, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8458, "grad_norm": 0.29679274559020996, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7837, "grad_norm": 0.27372097969055176, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8114, "grad_norm": 0.3357728123664856, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8047, "grad_norm": 0.3300599157810211, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7673, "grad_norm": 0.27955976128578186, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7373, "grad_norm": 0.32679328322410583, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7619, "grad_norm": 0.2867962121963501, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8399, "grad_norm": 0.3120456039905548, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8224, "grad_norm": 0.2997247576713562, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8069, "grad_norm": 0.28892984986305237, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7963, "grad_norm": 0.3257523775100708, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7935, "grad_norm": 0.26903730630874634, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7628, "grad_norm": 0.33329081535339355, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7534, "grad_norm": 0.28643161058425903, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7525, "grad_norm": 0.31738072633743286, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.7929, "grad_norm": 0.3202503025531769, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7488, "grad_norm": 0.329441100358963, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7152, "grad_norm": 0.33071649074554443, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7853, "grad_norm": 0.29738152027130127, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7115, "grad_norm": 0.3103555738925934, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7728, "grad_norm": 0.3052198588848114, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7216, "grad_norm": 0.31931647658348083, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.782, "grad_norm": 0.33478689193725586, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8271, "grad_norm": 0.32925254106521606, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8361, "grad_norm": 0.34086015820503235, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.8234, "grad_norm": 0.3911075294017792, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8241, "grad_norm": 0.28895238041877747, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7749, "grad_norm": 0.3233981728553772, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8016, "grad_norm": 0.35691210627555847, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8894, "grad_norm": 0.31073638796806335, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8502, "grad_norm": 0.2948456108570099, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7993, "grad_norm": 0.3256973326206207, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7939, "grad_norm": 0.2843184173107147, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}]} +{"epoch": 0.9995129079396006, "step": 1026, "epoch_duration": 5042.019628286362, "total_accumulated_duration": 5042.019628286362, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0786, "grad_norm": 0.6911329030990601, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4641, "grad_norm": 0.5412330627441406, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.267, "grad_norm": 0.583145022392273, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.024, "grad_norm": 0.5168295502662659, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9514, "grad_norm": 0.4544302821159363, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.989, "grad_norm": 0.45545899868011475, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9795, "grad_norm": 0.47717148065567017, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9267, "grad_norm": 0.43077918887138367, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9174, "grad_norm": 0.41169410943984985, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8914, "grad_norm": 0.4109228253364563, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9063, "grad_norm": 0.727739155292511, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.849, "grad_norm": 0.39693373441696167, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.8961, "grad_norm": 0.32479676604270935, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.8476, "grad_norm": 0.41577887535095215, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7912, "grad_norm": 0.3692895770072937, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8483, "grad_norm": 0.3582785427570343, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8469, "grad_norm": 0.43169599771499634, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.8429, "grad_norm": 0.38158753514289856, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8899, "grad_norm": 0.3516024053096771, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7692, "grad_norm": 0.4199542701244354, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8455, "grad_norm": 0.35992029309272766, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8784, "grad_norm": 0.40170818567276, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7562, "grad_norm": 0.34489646553993225, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.825, "grad_norm": 0.36570557951927185, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8508, "grad_norm": 0.3102056682109833, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7412, "grad_norm": 0.34393879771232605, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8589, "grad_norm": 0.3517219126224518, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8518, "grad_norm": 0.3594422936439514, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.32002630829811096, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.8205, "grad_norm": 0.3338747024536133, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7622, "grad_norm": 0.3600769639015198, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8237, "grad_norm": 0.3585824966430664, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8172, "grad_norm": 0.333251029253006, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8226, "grad_norm": 0.347112238407135, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9134, "grad_norm": 0.3596053421497345, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7916, "grad_norm": 0.3617101311683655, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8656, "grad_norm": 0.36017975211143494, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8021, "grad_norm": 0.31422123312950134, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9224, "grad_norm": 0.3083730638027191, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7993, "grad_norm": 0.31650277972221375, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.9, "grad_norm": 0.3626275062561035, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8122, "grad_norm": 0.4097851812839508, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8552, "grad_norm": 0.32672059535980225, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8498, "grad_norm": 0.3282705843448639, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8484, "grad_norm": 0.3159716725349426, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8478, "grad_norm": 0.4060676097869873, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7905, "grad_norm": 0.3285563886165619, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7555, "grad_norm": 0.3229023814201355, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7853, "grad_norm": 0.4205825924873352, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8225, "grad_norm": 0.3491184115409851, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8043, "grad_norm": 0.30142080783843994, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6917, "grad_norm": 0.32370421290397644, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8231, "grad_norm": 0.3260400891304016, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.771, "grad_norm": 0.30405792593955994, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7804, "grad_norm": 0.3368788957595825, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8228, "grad_norm": 0.29851457476615906, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8358, "grad_norm": 0.3424811065196991, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8333, "grad_norm": 0.3058519661426544, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7032, "grad_norm": 0.3714658319950104, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.7629, "grad_norm": 0.33727630972862244, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8341, "grad_norm": 0.32690030336380005, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.727, "grad_norm": 0.33565303683280945, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8283, "grad_norm": 0.3237195909023285, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8053, "grad_norm": 0.3161337375640869, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7407, "grad_norm": 0.7614491581916809, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8636, "grad_norm": 0.2802790105342865, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8411, "grad_norm": 0.330805242061615, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.8439, "grad_norm": 0.2941242456436157, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7857, "grad_norm": 0.32636013627052307, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8445, "grad_norm": 0.3066718876361847, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7808, "grad_norm": 0.2920783460140228, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8099, "grad_norm": 0.2931442856788635, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8038, "grad_norm": 0.314094215631485, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7718, "grad_norm": 0.2800949215888977, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7382, "grad_norm": 0.3249237537384033, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7624, "grad_norm": 0.2794058918952942, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8376, "grad_norm": 0.37049400806427, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8185, "grad_norm": 0.2909463047981262, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8073, "grad_norm": 0.28539058566093445, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7946, "grad_norm": 0.3309869170188904, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7916, "grad_norm": 0.2745763957500458, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7661, "grad_norm": 0.363674521446228, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7543, "grad_norm": 0.2761007249355316, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7543, "grad_norm": 0.5074517130851746, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.7935, "grad_norm": 0.32050713896751404, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.749, "grad_norm": 0.3138967752456665, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.714, "grad_norm": 0.31975120306015015, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7818, "grad_norm": 0.3553573787212372, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7097, "grad_norm": 0.35411757230758667, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7703, "grad_norm": 0.2980964183807373, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7259, "grad_norm": 0.3381917476654053, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7781, "grad_norm": 0.3236033320426941, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8248, "grad_norm": 0.3197905123233795, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8367, "grad_norm": 0.31410661339759827, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.8219, "grad_norm": 0.3096868395805359, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8244, "grad_norm": 0.28688159584999084, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7729, "grad_norm": 0.5106386542320251, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8041, "grad_norm": 0.38583698868751526, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8887, "grad_norm": 0.32006171345710754, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8518, "grad_norm": 0.3037484586238861, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7987, "grad_norm": 0.32314303517341614, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7957, "grad_norm": 0.28476133942604065, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}]} +{"epoch": 0.9995129079396006, "step": 1026, "epoch_duration": 1682.2428288459778, "total_accumulated_duration": 1682.2428288459778, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0782, "grad_norm": 0.6537588834762573, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4644, "grad_norm": 0.5270306468009949, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2589, "grad_norm": 0.6826501488685608, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0141, "grad_norm": 0.5061377286911011, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9458, "grad_norm": 0.4300410747528076, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.983, "grad_norm": 0.5063319802284241, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9799, "grad_norm": 0.49310117959976196, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9277, "grad_norm": 0.4676004648208618, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9147, "grad_norm": 0.41647228598594666, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8894, "grad_norm": 0.40217313170433044, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9099, "grad_norm": 0.4123637080192566, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8471, "grad_norm": 0.37414297461509705, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3179326355457306, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.847, "grad_norm": 0.3548192083835602, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7919, "grad_norm": 0.3273540139198303, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8496, "grad_norm": 0.36500975489616394, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8473, "grad_norm": 0.4106619656085968, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.841, "grad_norm": 0.41361644864082336, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8879, "grad_norm": 0.3608580231666565, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7717, "grad_norm": 0.4291760325431824, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8437, "grad_norm": 0.344184011220932, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8779, "grad_norm": 0.3834705650806427, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7533, "grad_norm": 0.3738210201263428, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.824, "grad_norm": 0.4306780695915222, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8519, "grad_norm": 0.5066465139389038, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7402, "grad_norm": 0.34227681159973145, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8614, "grad_norm": 0.3346865475177765, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8502, "grad_norm": 0.3639362156391144, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.33223700523376465, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.35176315903663635, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7635, "grad_norm": 0.3581472635269165, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8262, "grad_norm": 0.35943421721458435, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8167, "grad_norm": 0.322051078081131, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8221, "grad_norm": 0.33904823660850525, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9159, "grad_norm": 0.39162731170654297, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7966, "grad_norm": 0.330624520778656, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8646, "grad_norm": 0.3793248236179352, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8038, "grad_norm": 0.3347395658493042, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9244, "grad_norm": 0.30527254939079285, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7982, "grad_norm": 0.3081390857696533, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8968, "grad_norm": 0.3742620050907135, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8095, "grad_norm": 0.4080568253993988, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8555, "grad_norm": 0.38034746050834656, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8494, "grad_norm": 0.34893402457237244, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8481, "grad_norm": 0.33285608887672424, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8466, "grad_norm": 0.4110095798969269, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7906, "grad_norm": 0.3658817410469055, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7589, "grad_norm": 0.31350770592689514, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7839, "grad_norm": 0.38827991485595703, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8224, "grad_norm": 0.3792393207550049, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8028, "grad_norm": 0.3004095256328583, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6899, "grad_norm": 0.3200063407421112, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8205, "grad_norm": 0.3206128478050232, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7725, "grad_norm": 0.30258631706237793, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7791, "grad_norm": 0.28210392594337463, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.34854066371917725, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8331, "grad_norm": 0.31689873337745667, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8311, "grad_norm": 0.31253790855407715, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7035, "grad_norm": 0.3229721188545227, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.766, "grad_norm": 0.3723772466182709, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8357, "grad_norm": 0.345798522233963, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7266, "grad_norm": 0.3440598249435425, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8307, "grad_norm": 0.3406416177749634, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8103, "grad_norm": 0.3218357264995575, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7457, "grad_norm": 0.45319172739982605, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8674, "grad_norm": 0.2787110507488251, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8426, "grad_norm": 0.3064707815647125, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.846, "grad_norm": 0.2940629720687866, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7865, "grad_norm": 0.31695225834846497, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.29589611291885376, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7828, "grad_norm": 0.3062121570110321, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8108, "grad_norm": 0.3315656781196594, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8056, "grad_norm": 0.30353930592536926, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7682, "grad_norm": 0.28360483050346375, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3362562656402588, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7607, "grad_norm": 0.40434667468070984, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8374, "grad_norm": 0.2930425703525543, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8216, "grad_norm": 0.30177003145217896, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8081, "grad_norm": 0.2784474790096283, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.35849854350090027, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7896, "grad_norm": 0.27329114079475403, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7633, "grad_norm": 0.33331671357154846, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7508, "grad_norm": 0.28727295994758606, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31391268968582153, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.793, "grad_norm": 0.3303709030151367, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7479, "grad_norm": 0.33772537112236023, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.32876333594322205, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7832, "grad_norm": 0.28444716334342957, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7077, "grad_norm": 0.3070019483566284, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7699, "grad_norm": 0.29484760761260986, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7211, "grad_norm": 0.32373034954071045, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7799, "grad_norm": 0.3229396939277649, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8226, "grad_norm": 0.33151453733444214, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8339, "grad_norm": 0.32037460803985596, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.822, "grad_norm": 0.31283533573150635, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8233, "grad_norm": 0.27984118461608887, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7755, "grad_norm": 0.316500186920166, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8032, "grad_norm": 0.33708682656288147, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8863, "grad_norm": 0.31026017665863037, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8458, "grad_norm": 0.30874672532081604, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7975, "grad_norm": 0.3257741630077362, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7936, "grad_norm": 0.2865653932094574, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}]} +{"epoch": 2.0, "step": 2053, "epoch_duration": 2075.387904882431, "total_accumulated_duration": 3757.630733728409, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0782, "grad_norm": 0.6537588834762573, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4644, "grad_norm": 0.5270306468009949, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2589, "grad_norm": 0.6826501488685608, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0141, "grad_norm": 0.5061377286911011, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9458, "grad_norm": 0.4300410747528076, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.983, "grad_norm": 0.5063319802284241, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9799, "grad_norm": 0.49310117959976196, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9277, "grad_norm": 0.4676004648208618, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9147, "grad_norm": 0.41647228598594666, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8894, "grad_norm": 0.40217313170433044, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9099, "grad_norm": 0.4123637080192566, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8471, "grad_norm": 0.37414297461509705, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3179326355457306, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.847, "grad_norm": 0.3548192083835602, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7919, "grad_norm": 0.3273540139198303, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8496, "grad_norm": 0.36500975489616394, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8473, "grad_norm": 0.4106619656085968, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.841, "grad_norm": 0.41361644864082336, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8879, "grad_norm": 0.3608580231666565, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7717, "grad_norm": 0.4291760325431824, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8437, "grad_norm": 0.344184011220932, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8779, "grad_norm": 0.3834705650806427, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7533, "grad_norm": 0.3738210201263428, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.824, "grad_norm": 0.4306780695915222, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8519, "grad_norm": 0.5066465139389038, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7402, "grad_norm": 0.34227681159973145, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8614, "grad_norm": 0.3346865475177765, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8502, "grad_norm": 0.3639362156391144, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.33223700523376465, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.35176315903663635, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7635, "grad_norm": 0.3581472635269165, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8262, "grad_norm": 0.35943421721458435, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8167, "grad_norm": 0.322051078081131, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8221, "grad_norm": 0.33904823660850525, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9159, "grad_norm": 0.39162731170654297, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7966, "grad_norm": 0.330624520778656, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8646, "grad_norm": 0.3793248236179352, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8038, "grad_norm": 0.3347395658493042, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9244, "grad_norm": 0.30527254939079285, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7982, "grad_norm": 0.3081390857696533, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8968, "grad_norm": 0.3742620050907135, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8095, "grad_norm": 0.4080568253993988, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8555, "grad_norm": 0.38034746050834656, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8494, "grad_norm": 0.34893402457237244, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8481, "grad_norm": 0.33285608887672424, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8466, "grad_norm": 0.4110095798969269, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7906, "grad_norm": 0.3658817410469055, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7589, "grad_norm": 0.31350770592689514, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7839, "grad_norm": 0.38827991485595703, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8224, "grad_norm": 0.3792393207550049, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8028, "grad_norm": 0.3004095256328583, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6899, "grad_norm": 0.3200063407421112, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8205, "grad_norm": 0.3206128478050232, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7725, "grad_norm": 0.30258631706237793, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7791, "grad_norm": 0.28210392594337463, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.34854066371917725, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8331, "grad_norm": 0.31689873337745667, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8311, "grad_norm": 0.31253790855407715, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7035, "grad_norm": 0.3229721188545227, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.766, "grad_norm": 0.3723772466182709, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8357, "grad_norm": 0.345798522233963, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7266, "grad_norm": 0.3440598249435425, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8307, "grad_norm": 0.3406416177749634, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8103, "grad_norm": 0.3218357264995575, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7457, "grad_norm": 0.45319172739982605, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8674, "grad_norm": 0.2787110507488251, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8426, "grad_norm": 0.3064707815647125, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.846, "grad_norm": 0.2940629720687866, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7865, "grad_norm": 0.31695225834846497, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.29589611291885376, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7828, "grad_norm": 0.3062121570110321, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8108, "grad_norm": 0.3315656781196594, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8056, "grad_norm": 0.30353930592536926, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7682, "grad_norm": 0.28360483050346375, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3362562656402588, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7607, "grad_norm": 0.40434667468070984, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8374, "grad_norm": 0.2930425703525543, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8216, "grad_norm": 0.30177003145217896, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8081, "grad_norm": 0.2784474790096283, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.35849854350090027, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7896, "grad_norm": 0.27329114079475403, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7633, "grad_norm": 0.33331671357154846, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7508, "grad_norm": 0.28727295994758606, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31391268968582153, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.793, "grad_norm": 0.3303709030151367, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7479, "grad_norm": 0.33772537112236023, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.32876333594322205, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7832, "grad_norm": 0.28444716334342957, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7077, "grad_norm": 0.3070019483566284, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7699, "grad_norm": 0.29484760761260986, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7211, "grad_norm": 0.32373034954071045, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7799, "grad_norm": 0.3229396939277649, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8226, "grad_norm": 0.33151453733444214, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8339, "grad_norm": 0.32037460803985596, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.822, "grad_norm": 0.31283533573150635, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8233, "grad_norm": 0.27984118461608887, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7755, "grad_norm": 0.316500186920166, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8032, "grad_norm": 0.33708682656288147, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8863, "grad_norm": 0.31026017665863037, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8458, "grad_norm": 0.30874672532081604, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7975, "grad_norm": 0.3257741630077362, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7936, "grad_norm": 0.2865653932094574, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}, {"eval_loss": 1.8103164434432983, "eval_runtime": 56.3917, "eval_samples_per_second": 8.991, "eval_steps_per_second": 1.135, "epoch": 0.9995129079396006, "step": 1026}, {"loss": 1.7013, "grad_norm": 0.2860608398914337, "learning_rate": 0.0002, "epoch": 1.003409644422796, "step": 1030}, {"loss": 1.7521, "grad_norm": 0.3156210780143738, "learning_rate": 0.0002, "epoch": 1.0131514856307842, "step": 1040}, {"loss": 1.6182, "grad_norm": 0.26126575469970703, "learning_rate": 0.0002, "epoch": 1.0228933268387725, "step": 1050}, {"loss": 1.7546, "grad_norm": 0.3019633889198303, "learning_rate": 0.0002, "epoch": 1.032635168046761, "step": 1060}, {"loss": 1.7096, "grad_norm": 0.534140944480896, "learning_rate": 0.0002, "epoch": 1.042377009254749, "step": 1070}, {"loss": 1.664, "grad_norm": 0.311872661113739, "learning_rate": 0.0002, "epoch": 1.0521188504627375, "step": 1080}, {"loss": 1.7056, "grad_norm": 0.3276001513004303, "learning_rate": 0.0002, "epoch": 1.0618606916707258, "step": 1090}, {"loss": 1.7943, "grad_norm": 0.35227468609809875, "learning_rate": 0.0002, "epoch": 1.071602532878714, "step": 1100}, {"loss": 1.7079, "grad_norm": 0.3597564995288849, "learning_rate": 0.0002, "epoch": 1.0813443740867024, "step": 1110}, {"loss": 1.7635, "grad_norm": 0.3547225296497345, "learning_rate": 0.0002, "epoch": 1.0910862152946907, "step": 1120}, {"loss": 1.6859, "grad_norm": 0.3399786353111267, "learning_rate": 0.0002, "epoch": 1.100828056502679, "step": 1130}, {"loss": 1.7478, "grad_norm": 0.3309086263179779, "learning_rate": 0.0002, "epoch": 1.1105698977106673, "step": 1140}, {"loss": 1.7372, "grad_norm": 0.39330706000328064, "learning_rate": 0.0002, "epoch": 1.1203117389186557, "step": 1150}, {"loss": 1.6236, "grad_norm": 0.3628021776676178, "learning_rate": 0.0002, "epoch": 1.130053580126644, "step": 1160}, {"loss": 1.8022, "grad_norm": 0.32995012402534485, "learning_rate": 0.0002, "epoch": 1.1397954213346322, "step": 1170}, {"loss": 1.636, "grad_norm": 0.36292821168899536, "learning_rate": 0.0002, "epoch": 1.1495372625426206, "step": 1180}, {"loss": 1.7156, "grad_norm": 0.3470092713832855, "learning_rate": 0.0002, "epoch": 1.159279103750609, "step": 1190}, {"loss": 1.7001, "grad_norm": 0.3496156334877014, "learning_rate": 0.0002, "epoch": 1.169020944958597, "step": 1200}, {"loss": 1.716, "grad_norm": 0.3442084789276123, "learning_rate": 0.0002, "epoch": 1.1787627861665855, "step": 1210}, {"loss": 1.7763, "grad_norm": 0.34983909130096436, "learning_rate": 0.0002, "epoch": 1.1885046273745739, "step": 1220}, {"loss": 1.6964, "grad_norm": 0.36505937576293945, "learning_rate": 0.0002, "epoch": 1.198246468582562, "step": 1230}, {"loss": 1.7382, "grad_norm": 0.31624770164489746, "learning_rate": 0.0002, "epoch": 1.2079883097905504, "step": 1240}, {"loss": 1.7196, "grad_norm": 0.3528020679950714, "learning_rate": 0.0002, "epoch": 1.2177301509985388, "step": 1250}, {"loss": 1.752, "grad_norm": 0.29294025897979736, "learning_rate": 0.0002, "epoch": 1.2274719922065271, "step": 1260}, {"loss": 1.719, "grad_norm": 0.35048434138298035, "learning_rate": 0.0002, "epoch": 1.2372138334145153, "step": 1270}, {"loss": 1.6806, "grad_norm": 0.35224461555480957, "learning_rate": 0.0002, "epoch": 1.2469556746225037, "step": 1280}, {"loss": 1.6836, "grad_norm": 0.4041554629802704, "learning_rate": 0.0002, "epoch": 1.256697515830492, "step": 1290}, {"loss": 1.6999, "grad_norm": 0.3447791039943695, "learning_rate": 0.0002, "epoch": 1.2664393570384802, "step": 1300}, {"loss": 1.7601, "grad_norm": 0.3315333425998688, "learning_rate": 0.0002, "epoch": 1.2761811982464686, "step": 1310}, {"loss": 1.7325, "grad_norm": 0.3587741255760193, "learning_rate": 0.0002, "epoch": 1.285923039454457, "step": 1320}, {"loss": 1.6707, "grad_norm": 0.3704394996166229, "learning_rate": 0.0002, "epoch": 1.2956648806624451, "step": 1330}, {"loss": 1.7374, "grad_norm": 0.38131803274154663, "learning_rate": 0.0002, "epoch": 1.3054067218704335, "step": 1340}, {"loss": 1.5956, "grad_norm": 0.36109617352485657, "learning_rate": 0.0002, "epoch": 1.3151485630784219, "step": 1350}, {"loss": 1.682, "grad_norm": 0.37283554673194885, "learning_rate": 0.0002, "epoch": 1.32489040428641, "step": 1360}, {"loss": 1.7351, "grad_norm": 0.31808891892433167, "learning_rate": 0.0002, "epoch": 1.3346322454943984, "step": 1370}, {"loss": 1.8368, "grad_norm": 0.3370385766029358, "learning_rate": 0.0002, "epoch": 1.3443740867023868, "step": 1380}, {"loss": 1.6922, "grad_norm": 0.3568558394908905, "learning_rate": 0.0002, "epoch": 1.354115927910375, "step": 1390}, {"loss": 1.7301, "grad_norm": 0.3537410497665405, "learning_rate": 0.0002, "epoch": 1.3638577691183633, "step": 1400}, {"loss": 1.6534, "grad_norm": 0.3536544144153595, "learning_rate": 0.0002, "epoch": 1.3735996103263517, "step": 1410}, {"loss": 1.6829, "grad_norm": 0.3772895038127899, "learning_rate": 0.0002, "epoch": 1.38334145153434, "step": 1420}, {"loss": 1.7344, "grad_norm": 0.38079720735549927, "learning_rate": 0.0002, "epoch": 1.3930832927423282, "step": 1430}, {"loss": 1.7121, "grad_norm": 0.3811109662055969, "learning_rate": 0.0002, "epoch": 1.4028251339503166, "step": 1440}, {"loss": 1.6424, "grad_norm": 0.38586318492889404, "learning_rate": 0.0002, "epoch": 1.412566975158305, "step": 1450}, {"loss": 1.7438, "grad_norm": 0.3405744135379791, "learning_rate": 0.0002, "epoch": 1.4223088163662934, "step": 1460}, {"loss": 1.7483, "grad_norm": 0.39527642726898193, "learning_rate": 0.0002, "epoch": 1.4320506575742815, "step": 1470}, {"loss": 1.7927, "grad_norm": 0.4494728744029999, "learning_rate": 0.0002, "epoch": 1.44179249878227, "step": 1480}, {"loss": 1.5993, "grad_norm": 0.34068453311920166, "learning_rate": 0.0002, "epoch": 1.4515343399902583, "step": 1490}, {"loss": 1.73, "grad_norm": 0.36169710755348206, "learning_rate": 0.0002, "epoch": 1.4612761811982464, "step": 1500}, {"loss": 1.7189, "grad_norm": 0.31519418954849243, "learning_rate": 0.0002, "epoch": 1.4710180224062348, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.35117292404174805, "learning_rate": 0.0002, "epoch": 1.4807598636142232, "step": 1520}, {"loss": 1.6662, "grad_norm": 0.40951141715049744, "learning_rate": 0.0002, "epoch": 1.4905017048222113, "step": 1530}, {"loss": 1.764, "grad_norm": 0.37542906403541565, "learning_rate": 0.0002, "epoch": 1.5002435460301997, "step": 1540}, {"loss": 1.6563, "grad_norm": 0.35395753383636475, "learning_rate": 0.0002, "epoch": 1.509985387238188, "step": 1550}, {"loss": 1.6517, "grad_norm": 0.35497018694877625, "learning_rate": 0.0002, "epoch": 1.5197272284461762, "step": 1560}, {"loss": 1.697, "grad_norm": 0.3693031072616577, "learning_rate": 0.0002, "epoch": 1.5294690696541646, "step": 1570}, {"loss": 1.7193, "grad_norm": 0.34013301134109497, "learning_rate": 0.0002, "epoch": 1.539210910862153, "step": 1580}, {"loss": 1.7364, "grad_norm": 0.37312784790992737, "learning_rate": 0.0002, "epoch": 1.5489527520701412, "step": 1590}, {"loss": 1.6698, "grad_norm": 0.357496440410614, "learning_rate": 0.0002, "epoch": 1.5586945932781295, "step": 1600}, {"loss": 1.7113, "grad_norm": 0.35192370414733887, "learning_rate": 0.0002, "epoch": 1.568436434486118, "step": 1610}, {"loss": 1.7505, "grad_norm": 0.34144821763038635, "learning_rate": 0.0002, "epoch": 1.578178275694106, "step": 1620}, {"loss": 1.6353, "grad_norm": 0.3320509195327759, "learning_rate": 0.0002, "epoch": 1.5879201169020944, "step": 1630}, {"loss": 1.6781, "grad_norm": 0.34178847074508667, "learning_rate": 0.0002, "epoch": 1.5976619581100828, "step": 1640}, {"loss": 1.7318, "grad_norm": 0.36567580699920654, "learning_rate": 0.0002, "epoch": 1.607403799318071, "step": 1650}, {"loss": 1.6971, "grad_norm": 0.35599812865257263, "learning_rate": 0.0002, "epoch": 1.6171456405260596, "step": 1660}, {"loss": 1.7256, "grad_norm": 0.33765384554862976, "learning_rate": 0.0002, "epoch": 1.6268874817340477, "step": 1670}, {"loss": 1.7396, "grad_norm": 0.33142679929733276, "learning_rate": 0.0002, "epoch": 1.636629322942036, "step": 1680}, {"loss": 1.7143, "grad_norm": 0.6959079504013062, "learning_rate": 0.0002, "epoch": 1.6463711641500245, "step": 1690}, {"loss": 1.7665, "grad_norm": 0.35073819756507874, "learning_rate": 0.0002, "epoch": 1.6561130053580126, "step": 1700}, {"loss": 1.7571, "grad_norm": 0.3461478352546692, "learning_rate": 0.0002, "epoch": 1.665854846566001, "step": 1710}, {"loss": 1.6608, "grad_norm": 0.3697752058506012, "learning_rate": 0.0002, "epoch": 1.6755966877739894, "step": 1720}, {"loss": 1.729, "grad_norm": 0.3755154609680176, "learning_rate": 0.0002, "epoch": 1.6853385289819776, "step": 1730}, {"loss": 1.6618, "grad_norm": 0.33977627754211426, "learning_rate": 0.0002, "epoch": 1.695080370189966, "step": 1740}, {"loss": 1.7207, "grad_norm": 0.4001041650772095, "learning_rate": 0.0002, "epoch": 1.7048222113979543, "step": 1750}, {"loss": 1.6756, "grad_norm": 0.36998286843299866, "learning_rate": 0.0002, "epoch": 1.7145640526059425, "step": 1760}, {"loss": 1.7506, "grad_norm": 0.39944565296173096, "learning_rate": 0.0002, "epoch": 1.7243058938139308, "step": 1770}, {"loss": 1.6725, "grad_norm": 0.4002859890460968, "learning_rate": 0.0002, "epoch": 1.7340477350219192, "step": 1780}, {"loss": 1.768, "grad_norm": 0.33336859941482544, "learning_rate": 0.0002, "epoch": 1.7437895762299074, "step": 1790}, {"loss": 1.7816, "grad_norm": 0.35853952169418335, "learning_rate": 0.0002, "epoch": 1.7535314174378958, "step": 1800}, {"loss": 1.7135, "grad_norm": 0.35876700282096863, "learning_rate": 0.0002, "epoch": 1.7632732586458841, "step": 1810}, {"loss": 1.7327, "grad_norm": 0.3497968912124634, "learning_rate": 0.0002, "epoch": 1.7730150998538723, "step": 1820}, {"loss": 1.7128, "grad_norm": 0.33182016015052795, "learning_rate": 0.0002, "epoch": 1.7827569410618607, "step": 1830}, {"loss": 1.7594, "grad_norm": 0.33359771966934204, "learning_rate": 0.0002, "epoch": 1.792498782269849, "step": 1840}, {"loss": 1.8611, "grad_norm": 0.38070961833000183, "learning_rate": 0.0002, "epoch": 1.8022406234778372, "step": 1850}, {"loss": 1.7576, "grad_norm": 0.34111160039901733, "learning_rate": 0.0002, "epoch": 1.8119824646858256, "step": 1860}, {"loss": 1.7533, "grad_norm": 0.4439302980899811, "learning_rate": 0.0002, "epoch": 1.821724305893814, "step": 1870}, {"loss": 1.7931, "grad_norm": 0.37065210938453674, "learning_rate": 0.0002, "epoch": 1.8314661471018021, "step": 1880}, {"loss": 1.7392, "grad_norm": 0.33630406856536865, "learning_rate": 0.0002, "epoch": 1.8412079883097907, "step": 1890}, {"loss": 1.6369, "grad_norm": 0.334553986787796, "learning_rate": 0.0002, "epoch": 1.8509498295177789, "step": 1900}, {"loss": 1.7271, "grad_norm": 0.3603808879852295, "learning_rate": 0.0002, "epoch": 1.860691670725767, "step": 1910}, {"loss": 1.6777, "grad_norm": 0.4307343363761902, "learning_rate": 0.0002, "epoch": 1.8704335119337556, "step": 1920}, {"loss": 1.75, "grad_norm": 0.455602765083313, "learning_rate": 0.0002, "epoch": 1.8801753531417438, "step": 1930}, {"loss": 1.6898, "grad_norm": 0.35242316126823425, "learning_rate": 0.0002, "epoch": 1.8899171943497322, "step": 1940}, {"loss": 1.7152, "grad_norm": 0.3589116632938385, "learning_rate": 0.0002, "epoch": 1.8996590355577205, "step": 1950}, {"loss": 1.7125, "grad_norm": 0.3540741801261902, "learning_rate": 0.0002, "epoch": 1.9094008767657087, "step": 1960}, {"loss": 1.6873, "grad_norm": 0.3547612428665161, "learning_rate": 0.0002, "epoch": 1.919142717973697, "step": 1970}, {"loss": 1.6995, "grad_norm": 0.3485773503780365, "learning_rate": 0.0002, "epoch": 1.9288845591816854, "step": 1980}, {"loss": 1.7301, "grad_norm": 0.3560304641723633, "learning_rate": 0.0002, "epoch": 1.9386264003896736, "step": 1990}, {"loss": 1.748, "grad_norm": 0.33299335837364197, "learning_rate": 0.0002, "epoch": 1.948368241597662, "step": 2000}, {"loss": 1.7397, "grad_norm": 0.35622233152389526, "learning_rate": 0.0002, "epoch": 1.9581100828056504, "step": 2010}, {"loss": 1.7201, "grad_norm": 0.3681301474571228, "learning_rate": 0.0002, "epoch": 1.9678519240136385, "step": 2020}, {"loss": 1.717, "grad_norm": 0.36158084869384766, "learning_rate": 0.0002, "epoch": 1.9775937652216269, "step": 2030}, {"loss": 1.6332, "grad_norm": 0.32560569047927856, "learning_rate": 0.0002, "epoch": 1.9873356064296153, "step": 2040}, {"loss": 1.6958, "grad_norm": 0.37404149770736694, "learning_rate": 0.0002, "epoch": 1.9970774476376034, "step": 2050}]} +{"epoch": 2.9995129079396006, "step": 3079, "epoch_duration": 2152.3596365451813, "total_accumulated_duration": 5909.99037027359, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0782, "grad_norm": 0.6537588834762573, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4644, "grad_norm": 0.5270306468009949, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2589, "grad_norm": 0.6826501488685608, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0141, "grad_norm": 0.5061377286911011, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9458, "grad_norm": 0.4300410747528076, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.983, "grad_norm": 0.5063319802284241, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9799, "grad_norm": 0.49310117959976196, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9277, "grad_norm": 0.4676004648208618, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9147, "grad_norm": 0.41647228598594666, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8894, "grad_norm": 0.40217313170433044, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9099, "grad_norm": 0.4123637080192566, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8471, "grad_norm": 0.37414297461509705, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3179326355457306, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.847, "grad_norm": 0.3548192083835602, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7919, "grad_norm": 0.3273540139198303, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8496, "grad_norm": 0.36500975489616394, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8473, "grad_norm": 0.4106619656085968, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.841, "grad_norm": 0.41361644864082336, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8879, "grad_norm": 0.3608580231666565, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7717, "grad_norm": 0.4291760325431824, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8437, "grad_norm": 0.344184011220932, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8779, "grad_norm": 0.3834705650806427, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7533, "grad_norm": 0.3738210201263428, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.824, "grad_norm": 0.4306780695915222, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8519, "grad_norm": 0.5066465139389038, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7402, "grad_norm": 0.34227681159973145, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8614, "grad_norm": 0.3346865475177765, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8502, "grad_norm": 0.3639362156391144, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.33223700523376465, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.35176315903663635, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7635, "grad_norm": 0.3581472635269165, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8262, "grad_norm": 0.35943421721458435, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8167, "grad_norm": 0.322051078081131, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8221, "grad_norm": 0.33904823660850525, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9159, "grad_norm": 0.39162731170654297, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7966, "grad_norm": 0.330624520778656, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8646, "grad_norm": 0.3793248236179352, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8038, "grad_norm": 0.3347395658493042, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9244, "grad_norm": 0.30527254939079285, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7982, "grad_norm": 0.3081390857696533, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8968, "grad_norm": 0.3742620050907135, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8095, "grad_norm": 0.4080568253993988, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8555, "grad_norm": 0.38034746050834656, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8494, "grad_norm": 0.34893402457237244, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8481, "grad_norm": 0.33285608887672424, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8466, "grad_norm": 0.4110095798969269, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7906, "grad_norm": 0.3658817410469055, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7589, "grad_norm": 0.31350770592689514, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7839, "grad_norm": 0.38827991485595703, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8224, "grad_norm": 0.3792393207550049, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8028, "grad_norm": 0.3004095256328583, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6899, "grad_norm": 0.3200063407421112, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8205, "grad_norm": 0.3206128478050232, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7725, "grad_norm": 0.30258631706237793, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7791, "grad_norm": 0.28210392594337463, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.34854066371917725, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8331, "grad_norm": 0.31689873337745667, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8311, "grad_norm": 0.31253790855407715, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7035, "grad_norm": 0.3229721188545227, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.766, "grad_norm": 0.3723772466182709, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8357, "grad_norm": 0.345798522233963, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7266, "grad_norm": 0.3440598249435425, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8307, "grad_norm": 0.3406416177749634, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8103, "grad_norm": 0.3218357264995575, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7457, "grad_norm": 0.45319172739982605, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8674, "grad_norm": 0.2787110507488251, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8426, "grad_norm": 0.3064707815647125, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.846, "grad_norm": 0.2940629720687866, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7865, "grad_norm": 0.31695225834846497, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.29589611291885376, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7828, "grad_norm": 0.3062121570110321, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8108, "grad_norm": 0.3315656781196594, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8056, "grad_norm": 0.30353930592536926, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7682, "grad_norm": 0.28360483050346375, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3362562656402588, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7607, "grad_norm": 0.40434667468070984, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8374, "grad_norm": 0.2930425703525543, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8216, "grad_norm": 0.30177003145217896, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8081, "grad_norm": 0.2784474790096283, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.35849854350090027, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7896, "grad_norm": 0.27329114079475403, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7633, "grad_norm": 0.33331671357154846, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7508, "grad_norm": 0.28727295994758606, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31391268968582153, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.793, "grad_norm": 0.3303709030151367, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7479, "grad_norm": 0.33772537112236023, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.32876333594322205, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7832, "grad_norm": 0.28444716334342957, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7077, "grad_norm": 0.3070019483566284, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7699, "grad_norm": 0.29484760761260986, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7211, "grad_norm": 0.32373034954071045, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7799, "grad_norm": 0.3229396939277649, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8226, "grad_norm": 0.33151453733444214, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8339, "grad_norm": 0.32037460803985596, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.822, "grad_norm": 0.31283533573150635, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8233, "grad_norm": 0.27984118461608887, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7755, "grad_norm": 0.316500186920166, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8032, "grad_norm": 0.33708682656288147, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8863, "grad_norm": 0.31026017665863037, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8458, "grad_norm": 0.30874672532081604, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7975, "grad_norm": 0.3257741630077362, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7936, "grad_norm": 0.2865653932094574, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}, {"eval_loss": 1.8103164434432983, "eval_runtime": 56.3917, "eval_samples_per_second": 8.991, "eval_steps_per_second": 1.135, "epoch": 0.9995129079396006, "step": 1026}, {"loss": 1.7013, "grad_norm": 0.2860608398914337, "learning_rate": 0.0002, "epoch": 1.003409644422796, "step": 1030}, {"loss": 1.7521, "grad_norm": 0.3156210780143738, "learning_rate": 0.0002, "epoch": 1.0131514856307842, "step": 1040}, {"loss": 1.6182, "grad_norm": 0.26126575469970703, "learning_rate": 0.0002, "epoch": 1.0228933268387725, "step": 1050}, {"loss": 1.7546, "grad_norm": 0.3019633889198303, "learning_rate": 0.0002, "epoch": 1.032635168046761, "step": 1060}, {"loss": 1.7096, "grad_norm": 0.534140944480896, "learning_rate": 0.0002, "epoch": 1.042377009254749, "step": 1070}, {"loss": 1.664, "grad_norm": 0.311872661113739, "learning_rate": 0.0002, "epoch": 1.0521188504627375, "step": 1080}, {"loss": 1.7056, "grad_norm": 0.3276001513004303, "learning_rate": 0.0002, "epoch": 1.0618606916707258, "step": 1090}, {"loss": 1.7943, "grad_norm": 0.35227468609809875, "learning_rate": 0.0002, "epoch": 1.071602532878714, "step": 1100}, {"loss": 1.7079, "grad_norm": 0.3597564995288849, "learning_rate": 0.0002, "epoch": 1.0813443740867024, "step": 1110}, {"loss": 1.7635, "grad_norm": 0.3547225296497345, "learning_rate": 0.0002, "epoch": 1.0910862152946907, "step": 1120}, {"loss": 1.6859, "grad_norm": 0.3399786353111267, "learning_rate": 0.0002, "epoch": 1.100828056502679, "step": 1130}, {"loss": 1.7478, "grad_norm": 0.3309086263179779, "learning_rate": 0.0002, "epoch": 1.1105698977106673, "step": 1140}, {"loss": 1.7372, "grad_norm": 0.39330706000328064, "learning_rate": 0.0002, "epoch": 1.1203117389186557, "step": 1150}, {"loss": 1.6236, "grad_norm": 0.3628021776676178, "learning_rate": 0.0002, "epoch": 1.130053580126644, "step": 1160}, {"loss": 1.8022, "grad_norm": 0.32995012402534485, "learning_rate": 0.0002, "epoch": 1.1397954213346322, "step": 1170}, {"loss": 1.636, "grad_norm": 0.36292821168899536, "learning_rate": 0.0002, "epoch": 1.1495372625426206, "step": 1180}, {"loss": 1.7156, "grad_norm": 0.3470092713832855, "learning_rate": 0.0002, "epoch": 1.159279103750609, "step": 1190}, {"loss": 1.7001, "grad_norm": 0.3496156334877014, "learning_rate": 0.0002, "epoch": 1.169020944958597, "step": 1200}, {"loss": 1.716, "grad_norm": 0.3442084789276123, "learning_rate": 0.0002, "epoch": 1.1787627861665855, "step": 1210}, {"loss": 1.7763, "grad_norm": 0.34983909130096436, "learning_rate": 0.0002, "epoch": 1.1885046273745739, "step": 1220}, {"loss": 1.6964, "grad_norm": 0.36505937576293945, "learning_rate": 0.0002, "epoch": 1.198246468582562, "step": 1230}, {"loss": 1.7382, "grad_norm": 0.31624770164489746, "learning_rate": 0.0002, "epoch": 1.2079883097905504, "step": 1240}, {"loss": 1.7196, "grad_norm": 0.3528020679950714, "learning_rate": 0.0002, "epoch": 1.2177301509985388, "step": 1250}, {"loss": 1.752, "grad_norm": 0.29294025897979736, "learning_rate": 0.0002, "epoch": 1.2274719922065271, "step": 1260}, {"loss": 1.719, "grad_norm": 0.35048434138298035, "learning_rate": 0.0002, "epoch": 1.2372138334145153, "step": 1270}, {"loss": 1.6806, "grad_norm": 0.35224461555480957, "learning_rate": 0.0002, "epoch": 1.2469556746225037, "step": 1280}, {"loss": 1.6836, "grad_norm": 0.4041554629802704, "learning_rate": 0.0002, "epoch": 1.256697515830492, "step": 1290}, {"loss": 1.6999, "grad_norm": 0.3447791039943695, "learning_rate": 0.0002, "epoch": 1.2664393570384802, "step": 1300}, {"loss": 1.7601, "grad_norm": 0.3315333425998688, "learning_rate": 0.0002, "epoch": 1.2761811982464686, "step": 1310}, {"loss": 1.7325, "grad_norm": 0.3587741255760193, "learning_rate": 0.0002, "epoch": 1.285923039454457, "step": 1320}, {"loss": 1.6707, "grad_norm": 0.3704394996166229, "learning_rate": 0.0002, "epoch": 1.2956648806624451, "step": 1330}, {"loss": 1.7374, "grad_norm": 0.38131803274154663, "learning_rate": 0.0002, "epoch": 1.3054067218704335, "step": 1340}, {"loss": 1.5956, "grad_norm": 0.36109617352485657, "learning_rate": 0.0002, "epoch": 1.3151485630784219, "step": 1350}, {"loss": 1.682, "grad_norm": 0.37283554673194885, "learning_rate": 0.0002, "epoch": 1.32489040428641, "step": 1360}, {"loss": 1.7351, "grad_norm": 0.31808891892433167, "learning_rate": 0.0002, "epoch": 1.3346322454943984, "step": 1370}, {"loss": 1.8368, "grad_norm": 0.3370385766029358, "learning_rate": 0.0002, "epoch": 1.3443740867023868, "step": 1380}, {"loss": 1.6922, "grad_norm": 0.3568558394908905, "learning_rate": 0.0002, "epoch": 1.354115927910375, "step": 1390}, {"loss": 1.7301, "grad_norm": 0.3537410497665405, "learning_rate": 0.0002, "epoch": 1.3638577691183633, "step": 1400}, {"loss": 1.6534, "grad_norm": 0.3536544144153595, "learning_rate": 0.0002, "epoch": 1.3735996103263517, "step": 1410}, {"loss": 1.6829, "grad_norm": 0.3772895038127899, "learning_rate": 0.0002, "epoch": 1.38334145153434, "step": 1420}, {"loss": 1.7344, "grad_norm": 0.38079720735549927, "learning_rate": 0.0002, "epoch": 1.3930832927423282, "step": 1430}, {"loss": 1.7121, "grad_norm": 0.3811109662055969, "learning_rate": 0.0002, "epoch": 1.4028251339503166, "step": 1440}, {"loss": 1.6424, "grad_norm": 0.38586318492889404, "learning_rate": 0.0002, "epoch": 1.412566975158305, "step": 1450}, {"loss": 1.7438, "grad_norm": 0.3405744135379791, "learning_rate": 0.0002, "epoch": 1.4223088163662934, "step": 1460}, {"loss": 1.7483, "grad_norm": 0.39527642726898193, "learning_rate": 0.0002, "epoch": 1.4320506575742815, "step": 1470}, {"loss": 1.7927, "grad_norm": 0.4494728744029999, "learning_rate": 0.0002, "epoch": 1.44179249878227, "step": 1480}, {"loss": 1.5993, "grad_norm": 0.34068453311920166, "learning_rate": 0.0002, "epoch": 1.4515343399902583, "step": 1490}, {"loss": 1.73, "grad_norm": 0.36169710755348206, "learning_rate": 0.0002, "epoch": 1.4612761811982464, "step": 1500}, {"loss": 1.7189, "grad_norm": 0.31519418954849243, "learning_rate": 0.0002, "epoch": 1.4710180224062348, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.35117292404174805, "learning_rate": 0.0002, "epoch": 1.4807598636142232, "step": 1520}, {"loss": 1.6662, "grad_norm": 0.40951141715049744, "learning_rate": 0.0002, "epoch": 1.4905017048222113, "step": 1530}, {"loss": 1.764, "grad_norm": 0.37542906403541565, "learning_rate": 0.0002, "epoch": 1.5002435460301997, "step": 1540}, {"loss": 1.6563, "grad_norm": 0.35395753383636475, "learning_rate": 0.0002, "epoch": 1.509985387238188, "step": 1550}, {"loss": 1.6517, "grad_norm": 0.35497018694877625, "learning_rate": 0.0002, "epoch": 1.5197272284461762, "step": 1560}, {"loss": 1.697, "grad_norm": 0.3693031072616577, "learning_rate": 0.0002, "epoch": 1.5294690696541646, "step": 1570}, {"loss": 1.7193, "grad_norm": 0.34013301134109497, "learning_rate": 0.0002, "epoch": 1.539210910862153, "step": 1580}, {"loss": 1.7364, "grad_norm": 0.37312784790992737, "learning_rate": 0.0002, "epoch": 1.5489527520701412, "step": 1590}, {"loss": 1.6698, "grad_norm": 0.357496440410614, "learning_rate": 0.0002, "epoch": 1.5586945932781295, "step": 1600}, {"loss": 1.7113, "grad_norm": 0.35192370414733887, "learning_rate": 0.0002, "epoch": 1.568436434486118, "step": 1610}, {"loss": 1.7505, "grad_norm": 0.34144821763038635, "learning_rate": 0.0002, "epoch": 1.578178275694106, "step": 1620}, {"loss": 1.6353, "grad_norm": 0.3320509195327759, "learning_rate": 0.0002, "epoch": 1.5879201169020944, "step": 1630}, {"loss": 1.6781, "grad_norm": 0.34178847074508667, "learning_rate": 0.0002, "epoch": 1.5976619581100828, "step": 1640}, {"loss": 1.7318, "grad_norm": 0.36567580699920654, "learning_rate": 0.0002, "epoch": 1.607403799318071, "step": 1650}, {"loss": 1.6971, "grad_norm": 0.35599812865257263, "learning_rate": 0.0002, "epoch": 1.6171456405260596, "step": 1660}, {"loss": 1.7256, "grad_norm": 0.33765384554862976, "learning_rate": 0.0002, "epoch": 1.6268874817340477, "step": 1670}, {"loss": 1.7396, "grad_norm": 0.33142679929733276, "learning_rate": 0.0002, "epoch": 1.636629322942036, "step": 1680}, {"loss": 1.7143, "grad_norm": 0.6959079504013062, "learning_rate": 0.0002, "epoch": 1.6463711641500245, "step": 1690}, {"loss": 1.7665, "grad_norm": 0.35073819756507874, "learning_rate": 0.0002, "epoch": 1.6561130053580126, "step": 1700}, {"loss": 1.7571, "grad_norm": 0.3461478352546692, "learning_rate": 0.0002, "epoch": 1.665854846566001, "step": 1710}, {"loss": 1.6608, "grad_norm": 0.3697752058506012, "learning_rate": 0.0002, "epoch": 1.6755966877739894, "step": 1720}, {"loss": 1.729, "grad_norm": 0.3755154609680176, "learning_rate": 0.0002, "epoch": 1.6853385289819776, "step": 1730}, {"loss": 1.6618, "grad_norm": 0.33977627754211426, "learning_rate": 0.0002, "epoch": 1.695080370189966, "step": 1740}, {"loss": 1.7207, "grad_norm": 0.4001041650772095, "learning_rate": 0.0002, "epoch": 1.7048222113979543, "step": 1750}, {"loss": 1.6756, "grad_norm": 0.36998286843299866, "learning_rate": 0.0002, "epoch": 1.7145640526059425, "step": 1760}, {"loss": 1.7506, "grad_norm": 0.39944565296173096, "learning_rate": 0.0002, "epoch": 1.7243058938139308, "step": 1770}, {"loss": 1.6725, "grad_norm": 0.4002859890460968, "learning_rate": 0.0002, "epoch": 1.7340477350219192, "step": 1780}, {"loss": 1.768, "grad_norm": 0.33336859941482544, "learning_rate": 0.0002, "epoch": 1.7437895762299074, "step": 1790}, {"loss": 1.7816, "grad_norm": 0.35853952169418335, "learning_rate": 0.0002, "epoch": 1.7535314174378958, "step": 1800}, {"loss": 1.7135, "grad_norm": 0.35876700282096863, "learning_rate": 0.0002, "epoch": 1.7632732586458841, "step": 1810}, {"loss": 1.7327, "grad_norm": 0.3497968912124634, "learning_rate": 0.0002, "epoch": 1.7730150998538723, "step": 1820}, {"loss": 1.7128, "grad_norm": 0.33182016015052795, "learning_rate": 0.0002, "epoch": 1.7827569410618607, "step": 1830}, {"loss": 1.7594, "grad_norm": 0.33359771966934204, "learning_rate": 0.0002, "epoch": 1.792498782269849, "step": 1840}, {"loss": 1.8611, "grad_norm": 0.38070961833000183, "learning_rate": 0.0002, "epoch": 1.8022406234778372, "step": 1850}, {"loss": 1.7576, "grad_norm": 0.34111160039901733, "learning_rate": 0.0002, "epoch": 1.8119824646858256, "step": 1860}, {"loss": 1.7533, "grad_norm": 0.4439302980899811, "learning_rate": 0.0002, "epoch": 1.821724305893814, "step": 1870}, {"loss": 1.7931, "grad_norm": 0.37065210938453674, "learning_rate": 0.0002, "epoch": 1.8314661471018021, "step": 1880}, {"loss": 1.7392, "grad_norm": 0.33630406856536865, "learning_rate": 0.0002, "epoch": 1.8412079883097907, "step": 1890}, {"loss": 1.6369, "grad_norm": 0.334553986787796, "learning_rate": 0.0002, "epoch": 1.8509498295177789, "step": 1900}, {"loss": 1.7271, "grad_norm": 0.3603808879852295, "learning_rate": 0.0002, "epoch": 1.860691670725767, "step": 1910}, {"loss": 1.6777, "grad_norm": 0.4307343363761902, "learning_rate": 0.0002, "epoch": 1.8704335119337556, "step": 1920}, {"loss": 1.75, "grad_norm": 0.455602765083313, "learning_rate": 0.0002, "epoch": 1.8801753531417438, "step": 1930}, {"loss": 1.6898, "grad_norm": 0.35242316126823425, "learning_rate": 0.0002, "epoch": 1.8899171943497322, "step": 1940}, {"loss": 1.7152, "grad_norm": 0.3589116632938385, "learning_rate": 0.0002, "epoch": 1.8996590355577205, "step": 1950}, {"loss": 1.7125, "grad_norm": 0.3540741801261902, "learning_rate": 0.0002, "epoch": 1.9094008767657087, "step": 1960}, {"loss": 1.6873, "grad_norm": 0.3547612428665161, "learning_rate": 0.0002, "epoch": 1.919142717973697, "step": 1970}, {"loss": 1.6995, "grad_norm": 0.3485773503780365, "learning_rate": 0.0002, "epoch": 1.9288845591816854, "step": 1980}, {"loss": 1.7301, "grad_norm": 0.3560304641723633, "learning_rate": 0.0002, "epoch": 1.9386264003896736, "step": 1990}, {"loss": 1.748, "grad_norm": 0.33299335837364197, "learning_rate": 0.0002, "epoch": 1.948368241597662, "step": 2000}, {"loss": 1.7397, "grad_norm": 0.35622233152389526, "learning_rate": 0.0002, "epoch": 1.9581100828056504, "step": 2010}, {"loss": 1.7201, "grad_norm": 0.3681301474571228, "learning_rate": 0.0002, "epoch": 1.9678519240136385, "step": 2020}, {"loss": 1.717, "grad_norm": 0.36158084869384766, "learning_rate": 0.0002, "epoch": 1.9775937652216269, "step": 2030}, {"loss": 1.6332, "grad_norm": 0.32560569047927856, "learning_rate": 0.0002, "epoch": 1.9873356064296153, "step": 2040}, {"loss": 1.6958, "grad_norm": 0.37404149770736694, "learning_rate": 0.0002, "epoch": 1.9970774476376034, "step": 2050}, {"eval_loss": 1.8119343519210815, "eval_runtime": 96.0045, "eval_samples_per_second": 5.281, "eval_steps_per_second": 0.667, "epoch": 2.0, "step": 2053}, {"loss": 1.657, "grad_norm": 0.374188631772995, "learning_rate": 0.0002, "epoch": 2.006819288845592, "step": 2060}, {"loss": 1.5655, "grad_norm": 0.421764075756073, "learning_rate": 0.0002, "epoch": 2.01656113005358, "step": 2070}, {"loss": 1.548, "grad_norm": 0.43841829895973206, "learning_rate": 0.0002, "epoch": 2.0263029712615683, "step": 2080}, {"loss": 1.6326, "grad_norm": 0.42298218607902527, "learning_rate": 0.0002, "epoch": 2.036044812469557, "step": 2090}, {"loss": 1.5883, "grad_norm": 0.43669602274894714, "learning_rate": 0.0002, "epoch": 2.045786653677545, "step": 2100}, {"loss": 1.6143, "grad_norm": 0.4080469012260437, "learning_rate": 0.0002, "epoch": 2.0555284948855332, "step": 2110}, {"loss": 1.5591, "grad_norm": 0.483192503452301, "learning_rate": 0.0002, "epoch": 2.065270336093522, "step": 2120}, {"loss": 1.6492, "grad_norm": 0.44427400827407837, "learning_rate": 0.0002, "epoch": 2.07501217730151, "step": 2130}, {"loss": 1.5845, "grad_norm": 0.48835131525993347, "learning_rate": 0.0002, "epoch": 2.084754018509498, "step": 2140}, {"loss": 1.5617, "grad_norm": 0.42733684182167053, "learning_rate": 0.0002, "epoch": 2.0944958597174868, "step": 2150}, {"loss": 1.5562, "grad_norm": 0.4258694648742676, "learning_rate": 0.0002, "epoch": 2.104237700925475, "step": 2160}, {"loss": 1.517, "grad_norm": 0.5164985656738281, "learning_rate": 0.0002, "epoch": 2.113979542133463, "step": 2170}, {"loss": 1.6393, "grad_norm": 0.4279228150844574, "learning_rate": 0.0002, "epoch": 2.1237213833414517, "step": 2180}, {"loss": 1.5895, "grad_norm": 0.48209506273269653, "learning_rate": 0.0002, "epoch": 2.13346322454944, "step": 2190}, {"loss": 1.6142, "grad_norm": 0.4071785509586334, "learning_rate": 0.0002, "epoch": 2.143205065757428, "step": 2200}, {"loss": 1.5469, "grad_norm": 0.4629398584365845, "learning_rate": 0.0002, "epoch": 2.1529469069654166, "step": 2210}, {"loss": 1.56, "grad_norm": 0.44390997290611267, "learning_rate": 0.0002, "epoch": 2.1626887481734047, "step": 2220}, {"loss": 1.5395, "grad_norm": 0.46886971592903137, "learning_rate": 0.0002, "epoch": 2.172430589381393, "step": 2230}, {"loss": 1.6108, "grad_norm": 0.43745434284210205, "learning_rate": 0.0002, "epoch": 2.1821724305893815, "step": 2240}, {"loss": 1.5416, "grad_norm": 0.42737245559692383, "learning_rate": 0.0002, "epoch": 2.1919142717973696, "step": 2250}, {"loss": 1.627, "grad_norm": 0.5028428435325623, "learning_rate": 0.0002, "epoch": 2.201656113005358, "step": 2260}, {"loss": 1.6148, "grad_norm": 0.48987212777137756, "learning_rate": 0.0002, "epoch": 2.2113979542133464, "step": 2270}, {"loss": 1.6107, "grad_norm": 0.48186370730400085, "learning_rate": 0.0002, "epoch": 2.2211397954213346, "step": 2280}, {"loss": 1.6657, "grad_norm": 0.4417429566383362, "learning_rate": 0.0002, "epoch": 2.2308816366293227, "step": 2290}, {"loss": 1.595, "grad_norm": 0.4757710099220276, "learning_rate": 0.0002, "epoch": 2.2406234778373113, "step": 2300}, {"loss": 1.591, "grad_norm": 0.44449448585510254, "learning_rate": 0.0002, "epoch": 2.2503653190452995, "step": 2310}, {"loss": 1.5742, "grad_norm": 0.5070863962173462, "learning_rate": 0.0002, "epoch": 2.260107160253288, "step": 2320}, {"loss": 1.5831, "grad_norm": 0.4967133700847626, "learning_rate": 0.0002, "epoch": 2.269849001461276, "step": 2330}, {"loss": 1.5857, "grad_norm": 0.5110220909118652, "learning_rate": 0.0002, "epoch": 2.2795908426692644, "step": 2340}, {"loss": 1.6266, "grad_norm": 0.47984135150909424, "learning_rate": 0.0002, "epoch": 2.289332683877253, "step": 2350}, {"loss": 1.5927, "grad_norm": 0.5005794763565063, "learning_rate": 0.0002, "epoch": 2.299074525085241, "step": 2360}, {"loss": 1.6131, "grad_norm": 0.4991425573825836, "learning_rate": 0.0002, "epoch": 2.3088163662932293, "step": 2370}, {"loss": 1.5386, "grad_norm": 0.4948616623878479, "learning_rate": 0.0002, "epoch": 2.318558207501218, "step": 2380}, {"loss": 1.5769, "grad_norm": 0.4533160328865051, "learning_rate": 0.0002, "epoch": 2.328300048709206, "step": 2390}, {"loss": 1.5856, "grad_norm": 0.5871071219444275, "learning_rate": 0.0002, "epoch": 2.338041889917194, "step": 2400}, {"loss": 1.591, "grad_norm": 0.5048075914382935, "learning_rate": 0.0002, "epoch": 2.347783731125183, "step": 2410}, {"loss": 1.6165, "grad_norm": 0.4973750412464142, "learning_rate": 0.0002, "epoch": 2.357525572333171, "step": 2420}, {"loss": 1.589, "grad_norm": 0.48294538259506226, "learning_rate": 0.0002, "epoch": 2.367267413541159, "step": 2430}, {"loss": 1.5782, "grad_norm": 0.7180454134941101, "learning_rate": 0.0002, "epoch": 2.3770092547491477, "step": 2440}, {"loss": 1.56, "grad_norm": 0.4627632796764374, "learning_rate": 0.0002, "epoch": 2.386751095957136, "step": 2450}, {"loss": 1.586, "grad_norm": 0.4834378957748413, "learning_rate": 0.0002, "epoch": 2.396492937165124, "step": 2460}, {"loss": 1.6145, "grad_norm": 0.5173670649528503, "learning_rate": 0.0002, "epoch": 2.4062347783731126, "step": 2470}, {"loss": 1.5464, "grad_norm": 0.49652737379074097, "learning_rate": 0.0002, "epoch": 2.4159766195811008, "step": 2480}, {"loss": 1.6977, "grad_norm": 0.47052669525146484, "learning_rate": 0.0002, "epoch": 2.4257184607890894, "step": 2490}, {"loss": 1.6215, "grad_norm": 0.5188006162643433, "learning_rate": 0.0002, "epoch": 2.4354603019970775, "step": 2500}, {"loss": 1.5512, "grad_norm": 0.5010119676589966, "learning_rate": 0.0002, "epoch": 2.4452021432050657, "step": 2510}, {"loss": 1.6403, "grad_norm": 0.4765235483646393, "learning_rate": 0.0002, "epoch": 2.4549439844130543, "step": 2520}, {"loss": 1.5907, "grad_norm": 0.5292699337005615, "learning_rate": 0.0002, "epoch": 2.4646858256210424, "step": 2530}, {"loss": 1.5866, "grad_norm": 0.48555099964141846, "learning_rate": 0.0002, "epoch": 2.4744276668290306, "step": 2540}, {"loss": 1.5361, "grad_norm": 0.4764043092727661, "learning_rate": 0.0002, "epoch": 2.484169508037019, "step": 2550}, {"loss": 1.6545, "grad_norm": 0.47839659452438354, "learning_rate": 0.0002, "epoch": 2.4939113492450073, "step": 2560}, {"loss": 1.6269, "grad_norm": 0.4514436721801758, "learning_rate": 0.0002, "epoch": 2.5036531904529955, "step": 2570}, {"loss": 1.6389, "grad_norm": 0.5681955218315125, "learning_rate": 0.0002, "epoch": 2.513395031660984, "step": 2580}, {"loss": 1.5976, "grad_norm": 0.49655985832214355, "learning_rate": 0.0002, "epoch": 2.5231368728689723, "step": 2590}, {"loss": 1.6052, "grad_norm": 0.5077657103538513, "learning_rate": 0.0002, "epoch": 2.5328787140769604, "step": 2600}, {"loss": 1.5658, "grad_norm": 0.5643279552459717, "learning_rate": 0.0002, "epoch": 2.542620555284949, "step": 2610}, {"loss": 1.5455, "grad_norm": 0.4715031087398529, "learning_rate": 0.0002, "epoch": 2.552362396492937, "step": 2620}, {"loss": 1.5907, "grad_norm": 0.528400719165802, "learning_rate": 0.0002, "epoch": 2.5621042377009253, "step": 2630}, {"loss": 1.5452, "grad_norm": 0.49469611048698425, "learning_rate": 0.0002, "epoch": 2.571846078908914, "step": 2640}, {"loss": 1.5904, "grad_norm": 0.4567806124687195, "learning_rate": 0.0002, "epoch": 2.581587920116902, "step": 2650}, {"loss": 1.6242, "grad_norm": 0.5357107520103455, "learning_rate": 0.0002, "epoch": 2.5913297613248902, "step": 2660}, {"loss": 1.6078, "grad_norm": 0.46977677941322327, "learning_rate": 0.0002, "epoch": 2.601071602532879, "step": 2670}, {"loss": 1.6994, "grad_norm": 0.6626771092414856, "learning_rate": 0.0002, "epoch": 2.610813443740867, "step": 2680}, {"loss": 1.5888, "grad_norm": 0.4587472081184387, "learning_rate": 0.0002, "epoch": 2.620555284948855, "step": 2690}, {"loss": 1.6002, "grad_norm": 0.4816797077655792, "learning_rate": 0.0002, "epoch": 2.6302971261568437, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.4856809675693512, "learning_rate": 0.0002, "epoch": 2.640038967364832, "step": 2710}, {"loss": 1.563, "grad_norm": 0.46010780334472656, "learning_rate": 0.0002, "epoch": 2.64978080857282, "step": 2720}, {"loss": 1.5374, "grad_norm": 0.4637954533100128, "learning_rate": 0.0002, "epoch": 2.6595226497808087, "step": 2730}, {"loss": 1.6493, "grad_norm": 0.5954997539520264, "learning_rate": 0.0002, "epoch": 2.669264490988797, "step": 2740}, {"loss": 1.5795, "grad_norm": 0.5071861743927002, "learning_rate": 0.0002, "epoch": 2.679006332196785, "step": 2750}, {"loss": 1.573, "grad_norm": 0.5415477156639099, "learning_rate": 0.0002, "epoch": 2.6887481734047736, "step": 2760}, {"loss": 1.5476, "grad_norm": 0.5618549585342407, "learning_rate": 0.0002, "epoch": 2.6984900146127617, "step": 2770}, {"loss": 1.608, "grad_norm": 0.49338817596435547, "learning_rate": 0.0002, "epoch": 2.70823185582075, "step": 2780}, {"loss": 1.6529, "grad_norm": 0.5149586796760559, "learning_rate": 0.0002, "epoch": 2.7179736970287385, "step": 2790}, {"loss": 1.6279, "grad_norm": 0.6247242093086243, "learning_rate": 0.0002, "epoch": 2.7277155382367266, "step": 2800}, {"loss": 1.4655, "grad_norm": 0.4749542474746704, "learning_rate": 0.0002, "epoch": 2.737457379444715, "step": 2810}, {"loss": 1.5984, "grad_norm": 0.4979191720485687, "learning_rate": 0.0002, "epoch": 2.7471992206527034, "step": 2820}, {"loss": 1.6377, "grad_norm": 0.4885074198246002, "learning_rate": 0.0002, "epoch": 2.7569410618606915, "step": 2830}, {"loss": 1.6529, "grad_norm": 0.5047747492790222, "learning_rate": 0.0002, "epoch": 2.76668290306868, "step": 2840}, {"loss": 1.6574, "grad_norm": 0.5280140042304993, "learning_rate": 0.0002, "epoch": 2.7764247442766683, "step": 2850}, {"loss": 1.5639, "grad_norm": 0.477668434381485, "learning_rate": 0.0002, "epoch": 2.7861665854846565, "step": 2860}, {"loss": 1.5923, "grad_norm": 0.4816327393054962, "learning_rate": 0.0002, "epoch": 2.795908426692645, "step": 2870}, {"loss": 1.6377, "grad_norm": 0.523259162902832, "learning_rate": 0.0002, "epoch": 2.805650267900633, "step": 2880}, {"loss": 1.5779, "grad_norm": 0.5045270919799805, "learning_rate": 0.0002, "epoch": 2.8153921091086214, "step": 2890}, {"loss": 1.5915, "grad_norm": 0.47986042499542236, "learning_rate": 0.0002, "epoch": 2.82513395031661, "step": 2900}, {"loss": 1.5997, "grad_norm": 0.4858797490596771, "learning_rate": 0.0002, "epoch": 2.834875791524598, "step": 2910}, {"loss": 1.5664, "grad_norm": 0.5261512398719788, "learning_rate": 0.0002, "epoch": 2.8446176327325867, "step": 2920}, {"loss": 1.5775, "grad_norm": 0.630550742149353, "learning_rate": 0.0002, "epoch": 2.854359473940575, "step": 2930}, {"loss": 1.5889, "grad_norm": 0.49119752645492554, "learning_rate": 0.0002, "epoch": 2.864101315148563, "step": 2940}, {"loss": 1.6033, "grad_norm": 0.4779070317745209, "learning_rate": 0.0002, "epoch": 2.8738431563565516, "step": 2950}, {"loss": 1.6353, "grad_norm": 0.5059782266616821, "learning_rate": 0.0002, "epoch": 2.88358499756454, "step": 2960}, {"loss": 1.6403, "grad_norm": 0.5466655492782593, "learning_rate": 0.0002, "epoch": 2.893326838772528, "step": 2970}, {"loss": 1.619, "grad_norm": 0.4865640103816986, "learning_rate": 0.0002, "epoch": 2.9030686799805165, "step": 2980}, {"loss": 1.5712, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.9128105211885047, "step": 2990}, {"loss": 1.6216, "grad_norm": 0.9112305641174316, "learning_rate": 0.0002, "epoch": 2.922552362396493, "step": 3000}, {"loss": 1.6493, "grad_norm": 0.3938814103603363, "learning_rate": 0.0002, "epoch": 2.9322942036044815, "step": 3010}, {"loss": 1.5446, "grad_norm": 0.5500800609588623, "learning_rate": 0.0002, "epoch": 2.9420360448124696, "step": 3020}, {"loss": 1.5626, "grad_norm": 0.5346390604972839, "learning_rate": 0.0002, "epoch": 2.9517778860204578, "step": 3030}, {"loss": 1.5783, "grad_norm": 0.5245014429092407, "learning_rate": 0.0002, "epoch": 2.9615197272284464, "step": 3040}, {"loss": 1.5549, "grad_norm": 0.4906884431838989, "learning_rate": 0.0002, "epoch": 2.9712615684364345, "step": 3050}, {"loss": 1.5796, "grad_norm": 0.47086769342422485, "learning_rate": 0.0002, "epoch": 2.9810034096444227, "step": 3060}, {"loss": 1.5917, "grad_norm": 0.5290229320526123, "learning_rate": 0.0002, "epoch": 2.9907452508524113, "step": 3070}]} +{"epoch": 4.0, "step": 4106, "epoch_duration": 1664.8996212482452, "total_accumulated_duration": 7574.889991521835, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0782, "grad_norm": 0.6537588834762573, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4644, "grad_norm": 0.5270306468009949, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2589, "grad_norm": 0.6826501488685608, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0141, "grad_norm": 0.5061377286911011, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9458, "grad_norm": 0.4300410747528076, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.983, "grad_norm": 0.5063319802284241, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9799, "grad_norm": 0.49310117959976196, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9277, "grad_norm": 0.4676004648208618, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9147, "grad_norm": 0.41647228598594666, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8894, "grad_norm": 0.40217313170433044, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9099, "grad_norm": 0.4123637080192566, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8471, "grad_norm": 0.37414297461509705, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3179326355457306, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.847, "grad_norm": 0.3548192083835602, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7919, "grad_norm": 0.3273540139198303, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8496, "grad_norm": 0.36500975489616394, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8473, "grad_norm": 0.4106619656085968, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.841, "grad_norm": 0.41361644864082336, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8879, "grad_norm": 0.3608580231666565, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7717, "grad_norm": 0.4291760325431824, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8437, "grad_norm": 0.344184011220932, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8779, "grad_norm": 0.3834705650806427, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7533, "grad_norm": 0.3738210201263428, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.824, "grad_norm": 0.4306780695915222, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8519, "grad_norm": 0.5066465139389038, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7402, "grad_norm": 0.34227681159973145, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8614, "grad_norm": 0.3346865475177765, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8502, "grad_norm": 0.3639362156391144, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.33223700523376465, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.35176315903663635, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7635, "grad_norm": 0.3581472635269165, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8262, "grad_norm": 0.35943421721458435, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8167, "grad_norm": 0.322051078081131, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8221, "grad_norm": 0.33904823660850525, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9159, "grad_norm": 0.39162731170654297, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7966, "grad_norm": 0.330624520778656, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8646, "grad_norm": 0.3793248236179352, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8038, "grad_norm": 0.3347395658493042, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9244, "grad_norm": 0.30527254939079285, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7982, "grad_norm": 0.3081390857696533, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8968, "grad_norm": 0.3742620050907135, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8095, "grad_norm": 0.4080568253993988, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8555, "grad_norm": 0.38034746050834656, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8494, "grad_norm": 0.34893402457237244, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8481, "grad_norm": 0.33285608887672424, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8466, "grad_norm": 0.4110095798969269, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7906, "grad_norm": 0.3658817410469055, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7589, "grad_norm": 0.31350770592689514, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7839, "grad_norm": 0.38827991485595703, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8224, "grad_norm": 0.3792393207550049, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8028, "grad_norm": 0.3004095256328583, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6899, "grad_norm": 0.3200063407421112, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8205, "grad_norm": 0.3206128478050232, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7725, "grad_norm": 0.30258631706237793, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7791, "grad_norm": 0.28210392594337463, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.34854066371917725, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8331, "grad_norm": 0.31689873337745667, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8311, "grad_norm": 0.31253790855407715, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7035, "grad_norm": 0.3229721188545227, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.766, "grad_norm": 0.3723772466182709, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8357, "grad_norm": 0.345798522233963, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7266, "grad_norm": 0.3440598249435425, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8307, "grad_norm": 0.3406416177749634, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8103, "grad_norm": 0.3218357264995575, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7457, "grad_norm": 0.45319172739982605, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8674, "grad_norm": 0.2787110507488251, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8426, "grad_norm": 0.3064707815647125, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.846, "grad_norm": 0.2940629720687866, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7865, "grad_norm": 0.31695225834846497, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.29589611291885376, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7828, "grad_norm": 0.3062121570110321, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8108, "grad_norm": 0.3315656781196594, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8056, "grad_norm": 0.30353930592536926, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7682, "grad_norm": 0.28360483050346375, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3362562656402588, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7607, "grad_norm": 0.40434667468070984, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8374, "grad_norm": 0.2930425703525543, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8216, "grad_norm": 0.30177003145217896, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8081, "grad_norm": 0.2784474790096283, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.35849854350090027, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7896, "grad_norm": 0.27329114079475403, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7633, "grad_norm": 0.33331671357154846, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7508, "grad_norm": 0.28727295994758606, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31391268968582153, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.793, "grad_norm": 0.3303709030151367, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7479, "grad_norm": 0.33772537112236023, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.32876333594322205, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7832, "grad_norm": 0.28444716334342957, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7077, "grad_norm": 0.3070019483566284, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7699, "grad_norm": 0.29484760761260986, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7211, "grad_norm": 0.32373034954071045, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7799, "grad_norm": 0.3229396939277649, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8226, "grad_norm": 0.33151453733444214, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8339, "grad_norm": 0.32037460803985596, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.822, "grad_norm": 0.31283533573150635, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8233, "grad_norm": 0.27984118461608887, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7755, "grad_norm": 0.316500186920166, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8032, "grad_norm": 0.33708682656288147, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8863, "grad_norm": 0.31026017665863037, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8458, "grad_norm": 0.30874672532081604, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7975, "grad_norm": 0.3257741630077362, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7936, "grad_norm": 0.2865653932094574, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}, {"eval_loss": 1.8103164434432983, "eval_runtime": 56.3917, "eval_samples_per_second": 8.991, "eval_steps_per_second": 1.135, "epoch": 0.9995129079396006, "step": 1026}, {"loss": 1.7013, "grad_norm": 0.2860608398914337, "learning_rate": 0.0002, "epoch": 1.003409644422796, "step": 1030}, {"loss": 1.7521, "grad_norm": 0.3156210780143738, "learning_rate": 0.0002, "epoch": 1.0131514856307842, "step": 1040}, {"loss": 1.6182, "grad_norm": 0.26126575469970703, "learning_rate": 0.0002, "epoch": 1.0228933268387725, "step": 1050}, {"loss": 1.7546, "grad_norm": 0.3019633889198303, "learning_rate": 0.0002, "epoch": 1.032635168046761, "step": 1060}, {"loss": 1.7096, "grad_norm": 0.534140944480896, "learning_rate": 0.0002, "epoch": 1.042377009254749, "step": 1070}, {"loss": 1.664, "grad_norm": 0.311872661113739, "learning_rate": 0.0002, "epoch": 1.0521188504627375, "step": 1080}, {"loss": 1.7056, "grad_norm": 0.3276001513004303, "learning_rate": 0.0002, "epoch": 1.0618606916707258, "step": 1090}, {"loss": 1.7943, "grad_norm": 0.35227468609809875, "learning_rate": 0.0002, "epoch": 1.071602532878714, "step": 1100}, {"loss": 1.7079, "grad_norm": 0.3597564995288849, "learning_rate": 0.0002, "epoch": 1.0813443740867024, "step": 1110}, {"loss": 1.7635, "grad_norm": 0.3547225296497345, "learning_rate": 0.0002, "epoch": 1.0910862152946907, "step": 1120}, {"loss": 1.6859, "grad_norm": 0.3399786353111267, "learning_rate": 0.0002, "epoch": 1.100828056502679, "step": 1130}, {"loss": 1.7478, "grad_norm": 0.3309086263179779, "learning_rate": 0.0002, "epoch": 1.1105698977106673, "step": 1140}, {"loss": 1.7372, "grad_norm": 0.39330706000328064, "learning_rate": 0.0002, "epoch": 1.1203117389186557, "step": 1150}, {"loss": 1.6236, "grad_norm": 0.3628021776676178, "learning_rate": 0.0002, "epoch": 1.130053580126644, "step": 1160}, {"loss": 1.8022, "grad_norm": 0.32995012402534485, "learning_rate": 0.0002, "epoch": 1.1397954213346322, "step": 1170}, {"loss": 1.636, "grad_norm": 0.36292821168899536, "learning_rate": 0.0002, "epoch": 1.1495372625426206, "step": 1180}, {"loss": 1.7156, "grad_norm": 0.3470092713832855, "learning_rate": 0.0002, "epoch": 1.159279103750609, "step": 1190}, {"loss": 1.7001, "grad_norm": 0.3496156334877014, "learning_rate": 0.0002, "epoch": 1.169020944958597, "step": 1200}, {"loss": 1.716, "grad_norm": 0.3442084789276123, "learning_rate": 0.0002, "epoch": 1.1787627861665855, "step": 1210}, {"loss": 1.7763, "grad_norm": 0.34983909130096436, "learning_rate": 0.0002, "epoch": 1.1885046273745739, "step": 1220}, {"loss": 1.6964, "grad_norm": 0.36505937576293945, "learning_rate": 0.0002, "epoch": 1.198246468582562, "step": 1230}, {"loss": 1.7382, "grad_norm": 0.31624770164489746, "learning_rate": 0.0002, "epoch": 1.2079883097905504, "step": 1240}, {"loss": 1.7196, "grad_norm": 0.3528020679950714, "learning_rate": 0.0002, "epoch": 1.2177301509985388, "step": 1250}, {"loss": 1.752, "grad_norm": 0.29294025897979736, "learning_rate": 0.0002, "epoch": 1.2274719922065271, "step": 1260}, {"loss": 1.719, "grad_norm": 0.35048434138298035, "learning_rate": 0.0002, "epoch": 1.2372138334145153, "step": 1270}, {"loss": 1.6806, "grad_norm": 0.35224461555480957, "learning_rate": 0.0002, "epoch": 1.2469556746225037, "step": 1280}, {"loss": 1.6836, "grad_norm": 0.4041554629802704, "learning_rate": 0.0002, "epoch": 1.256697515830492, "step": 1290}, {"loss": 1.6999, "grad_norm": 0.3447791039943695, "learning_rate": 0.0002, "epoch": 1.2664393570384802, "step": 1300}, {"loss": 1.7601, "grad_norm": 0.3315333425998688, "learning_rate": 0.0002, "epoch": 1.2761811982464686, "step": 1310}, {"loss": 1.7325, "grad_norm": 0.3587741255760193, "learning_rate": 0.0002, "epoch": 1.285923039454457, "step": 1320}, {"loss": 1.6707, "grad_norm": 0.3704394996166229, "learning_rate": 0.0002, "epoch": 1.2956648806624451, "step": 1330}, {"loss": 1.7374, "grad_norm": 0.38131803274154663, "learning_rate": 0.0002, "epoch": 1.3054067218704335, "step": 1340}, {"loss": 1.5956, "grad_norm": 0.36109617352485657, "learning_rate": 0.0002, "epoch": 1.3151485630784219, "step": 1350}, {"loss": 1.682, "grad_norm": 0.37283554673194885, "learning_rate": 0.0002, "epoch": 1.32489040428641, "step": 1360}, {"loss": 1.7351, "grad_norm": 0.31808891892433167, "learning_rate": 0.0002, "epoch": 1.3346322454943984, "step": 1370}, {"loss": 1.8368, "grad_norm": 0.3370385766029358, "learning_rate": 0.0002, "epoch": 1.3443740867023868, "step": 1380}, {"loss": 1.6922, "grad_norm": 0.3568558394908905, "learning_rate": 0.0002, "epoch": 1.354115927910375, "step": 1390}, {"loss": 1.7301, "grad_norm": 0.3537410497665405, "learning_rate": 0.0002, "epoch": 1.3638577691183633, "step": 1400}, {"loss": 1.6534, "grad_norm": 0.3536544144153595, "learning_rate": 0.0002, "epoch": 1.3735996103263517, "step": 1410}, {"loss": 1.6829, "grad_norm": 0.3772895038127899, "learning_rate": 0.0002, "epoch": 1.38334145153434, "step": 1420}, {"loss": 1.7344, "grad_norm": 0.38079720735549927, "learning_rate": 0.0002, "epoch": 1.3930832927423282, "step": 1430}, {"loss": 1.7121, "grad_norm": 0.3811109662055969, "learning_rate": 0.0002, "epoch": 1.4028251339503166, "step": 1440}, {"loss": 1.6424, "grad_norm": 0.38586318492889404, "learning_rate": 0.0002, "epoch": 1.412566975158305, "step": 1450}, {"loss": 1.7438, "grad_norm": 0.3405744135379791, "learning_rate": 0.0002, "epoch": 1.4223088163662934, "step": 1460}, {"loss": 1.7483, "grad_norm": 0.39527642726898193, "learning_rate": 0.0002, "epoch": 1.4320506575742815, "step": 1470}, {"loss": 1.7927, "grad_norm": 0.4494728744029999, "learning_rate": 0.0002, "epoch": 1.44179249878227, "step": 1480}, {"loss": 1.5993, "grad_norm": 0.34068453311920166, "learning_rate": 0.0002, "epoch": 1.4515343399902583, "step": 1490}, {"loss": 1.73, "grad_norm": 0.36169710755348206, "learning_rate": 0.0002, "epoch": 1.4612761811982464, "step": 1500}, {"loss": 1.7189, "grad_norm": 0.31519418954849243, "learning_rate": 0.0002, "epoch": 1.4710180224062348, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.35117292404174805, "learning_rate": 0.0002, "epoch": 1.4807598636142232, "step": 1520}, {"loss": 1.6662, "grad_norm": 0.40951141715049744, "learning_rate": 0.0002, "epoch": 1.4905017048222113, "step": 1530}, {"loss": 1.764, "grad_norm": 0.37542906403541565, "learning_rate": 0.0002, "epoch": 1.5002435460301997, "step": 1540}, {"loss": 1.6563, "grad_norm": 0.35395753383636475, "learning_rate": 0.0002, "epoch": 1.509985387238188, "step": 1550}, {"loss": 1.6517, "grad_norm": 0.35497018694877625, "learning_rate": 0.0002, "epoch": 1.5197272284461762, "step": 1560}, {"loss": 1.697, "grad_norm": 0.3693031072616577, "learning_rate": 0.0002, "epoch": 1.5294690696541646, "step": 1570}, {"loss": 1.7193, "grad_norm": 0.34013301134109497, "learning_rate": 0.0002, "epoch": 1.539210910862153, "step": 1580}, {"loss": 1.7364, "grad_norm": 0.37312784790992737, "learning_rate": 0.0002, "epoch": 1.5489527520701412, "step": 1590}, {"loss": 1.6698, "grad_norm": 0.357496440410614, "learning_rate": 0.0002, "epoch": 1.5586945932781295, "step": 1600}, {"loss": 1.7113, "grad_norm": 0.35192370414733887, "learning_rate": 0.0002, "epoch": 1.568436434486118, "step": 1610}, {"loss": 1.7505, "grad_norm": 0.34144821763038635, "learning_rate": 0.0002, "epoch": 1.578178275694106, "step": 1620}, {"loss": 1.6353, "grad_norm": 0.3320509195327759, "learning_rate": 0.0002, "epoch": 1.5879201169020944, "step": 1630}, {"loss": 1.6781, "grad_norm": 0.34178847074508667, "learning_rate": 0.0002, "epoch": 1.5976619581100828, "step": 1640}, {"loss": 1.7318, "grad_norm": 0.36567580699920654, "learning_rate": 0.0002, "epoch": 1.607403799318071, "step": 1650}, {"loss": 1.6971, "grad_norm": 0.35599812865257263, "learning_rate": 0.0002, "epoch": 1.6171456405260596, "step": 1660}, {"loss": 1.7256, "grad_norm": 0.33765384554862976, "learning_rate": 0.0002, "epoch": 1.6268874817340477, "step": 1670}, {"loss": 1.7396, "grad_norm": 0.33142679929733276, "learning_rate": 0.0002, "epoch": 1.636629322942036, "step": 1680}, {"loss": 1.7143, "grad_norm": 0.6959079504013062, "learning_rate": 0.0002, "epoch": 1.6463711641500245, "step": 1690}, {"loss": 1.7665, "grad_norm": 0.35073819756507874, "learning_rate": 0.0002, "epoch": 1.6561130053580126, "step": 1700}, {"loss": 1.7571, "grad_norm": 0.3461478352546692, "learning_rate": 0.0002, "epoch": 1.665854846566001, "step": 1710}, {"loss": 1.6608, "grad_norm": 0.3697752058506012, "learning_rate": 0.0002, "epoch": 1.6755966877739894, "step": 1720}, {"loss": 1.729, "grad_norm": 0.3755154609680176, "learning_rate": 0.0002, "epoch": 1.6853385289819776, "step": 1730}, {"loss": 1.6618, "grad_norm": 0.33977627754211426, "learning_rate": 0.0002, "epoch": 1.695080370189966, "step": 1740}, {"loss": 1.7207, "grad_norm": 0.4001041650772095, "learning_rate": 0.0002, "epoch": 1.7048222113979543, "step": 1750}, {"loss": 1.6756, "grad_norm": 0.36998286843299866, "learning_rate": 0.0002, "epoch": 1.7145640526059425, "step": 1760}, {"loss": 1.7506, "grad_norm": 0.39944565296173096, "learning_rate": 0.0002, "epoch": 1.7243058938139308, "step": 1770}, {"loss": 1.6725, "grad_norm": 0.4002859890460968, "learning_rate": 0.0002, "epoch": 1.7340477350219192, "step": 1780}, {"loss": 1.768, "grad_norm": 0.33336859941482544, "learning_rate": 0.0002, "epoch": 1.7437895762299074, "step": 1790}, {"loss": 1.7816, "grad_norm": 0.35853952169418335, "learning_rate": 0.0002, "epoch": 1.7535314174378958, "step": 1800}, {"loss": 1.7135, "grad_norm": 0.35876700282096863, "learning_rate": 0.0002, "epoch": 1.7632732586458841, "step": 1810}, {"loss": 1.7327, "grad_norm": 0.3497968912124634, "learning_rate": 0.0002, "epoch": 1.7730150998538723, "step": 1820}, {"loss": 1.7128, "grad_norm": 0.33182016015052795, "learning_rate": 0.0002, "epoch": 1.7827569410618607, "step": 1830}, {"loss": 1.7594, "grad_norm": 0.33359771966934204, "learning_rate": 0.0002, "epoch": 1.792498782269849, "step": 1840}, {"loss": 1.8611, "grad_norm": 0.38070961833000183, "learning_rate": 0.0002, "epoch": 1.8022406234778372, "step": 1850}, {"loss": 1.7576, "grad_norm": 0.34111160039901733, "learning_rate": 0.0002, "epoch": 1.8119824646858256, "step": 1860}, {"loss": 1.7533, "grad_norm": 0.4439302980899811, "learning_rate": 0.0002, "epoch": 1.821724305893814, "step": 1870}, {"loss": 1.7931, "grad_norm": 0.37065210938453674, "learning_rate": 0.0002, "epoch": 1.8314661471018021, "step": 1880}, {"loss": 1.7392, "grad_norm": 0.33630406856536865, "learning_rate": 0.0002, "epoch": 1.8412079883097907, "step": 1890}, {"loss": 1.6369, "grad_norm": 0.334553986787796, "learning_rate": 0.0002, "epoch": 1.8509498295177789, "step": 1900}, {"loss": 1.7271, "grad_norm": 0.3603808879852295, "learning_rate": 0.0002, "epoch": 1.860691670725767, "step": 1910}, {"loss": 1.6777, "grad_norm": 0.4307343363761902, "learning_rate": 0.0002, "epoch": 1.8704335119337556, "step": 1920}, {"loss": 1.75, "grad_norm": 0.455602765083313, "learning_rate": 0.0002, "epoch": 1.8801753531417438, "step": 1930}, {"loss": 1.6898, "grad_norm": 0.35242316126823425, "learning_rate": 0.0002, "epoch": 1.8899171943497322, "step": 1940}, {"loss": 1.7152, "grad_norm": 0.3589116632938385, "learning_rate": 0.0002, "epoch": 1.8996590355577205, "step": 1950}, {"loss": 1.7125, "grad_norm": 0.3540741801261902, "learning_rate": 0.0002, "epoch": 1.9094008767657087, "step": 1960}, {"loss": 1.6873, "grad_norm": 0.3547612428665161, "learning_rate": 0.0002, "epoch": 1.919142717973697, "step": 1970}, {"loss": 1.6995, "grad_norm": 0.3485773503780365, "learning_rate": 0.0002, "epoch": 1.9288845591816854, "step": 1980}, {"loss": 1.7301, "grad_norm": 0.3560304641723633, "learning_rate": 0.0002, "epoch": 1.9386264003896736, "step": 1990}, {"loss": 1.748, "grad_norm": 0.33299335837364197, "learning_rate": 0.0002, "epoch": 1.948368241597662, "step": 2000}, {"loss": 1.7397, "grad_norm": 0.35622233152389526, "learning_rate": 0.0002, "epoch": 1.9581100828056504, "step": 2010}, {"loss": 1.7201, "grad_norm": 0.3681301474571228, "learning_rate": 0.0002, "epoch": 1.9678519240136385, "step": 2020}, {"loss": 1.717, "grad_norm": 0.36158084869384766, "learning_rate": 0.0002, "epoch": 1.9775937652216269, "step": 2030}, {"loss": 1.6332, "grad_norm": 0.32560569047927856, "learning_rate": 0.0002, "epoch": 1.9873356064296153, "step": 2040}, {"loss": 1.6958, "grad_norm": 0.37404149770736694, "learning_rate": 0.0002, "epoch": 1.9970774476376034, "step": 2050}, {"eval_loss": 1.8119343519210815, "eval_runtime": 96.0045, "eval_samples_per_second": 5.281, "eval_steps_per_second": 0.667, "epoch": 2.0, "step": 2053}, {"loss": 1.657, "grad_norm": 0.374188631772995, "learning_rate": 0.0002, "epoch": 2.006819288845592, "step": 2060}, {"loss": 1.5655, "grad_norm": 0.421764075756073, "learning_rate": 0.0002, "epoch": 2.01656113005358, "step": 2070}, {"loss": 1.548, "grad_norm": 0.43841829895973206, "learning_rate": 0.0002, "epoch": 2.0263029712615683, "step": 2080}, {"loss": 1.6326, "grad_norm": 0.42298218607902527, "learning_rate": 0.0002, "epoch": 2.036044812469557, "step": 2090}, {"loss": 1.5883, "grad_norm": 0.43669602274894714, "learning_rate": 0.0002, "epoch": 2.045786653677545, "step": 2100}, {"loss": 1.6143, "grad_norm": 0.4080469012260437, "learning_rate": 0.0002, "epoch": 2.0555284948855332, "step": 2110}, {"loss": 1.5591, "grad_norm": 0.483192503452301, "learning_rate": 0.0002, "epoch": 2.065270336093522, "step": 2120}, {"loss": 1.6492, "grad_norm": 0.44427400827407837, "learning_rate": 0.0002, "epoch": 2.07501217730151, "step": 2130}, {"loss": 1.5845, "grad_norm": 0.48835131525993347, "learning_rate": 0.0002, "epoch": 2.084754018509498, "step": 2140}, {"loss": 1.5617, "grad_norm": 0.42733684182167053, "learning_rate": 0.0002, "epoch": 2.0944958597174868, "step": 2150}, {"loss": 1.5562, "grad_norm": 0.4258694648742676, "learning_rate": 0.0002, "epoch": 2.104237700925475, "step": 2160}, {"loss": 1.517, "grad_norm": 0.5164985656738281, "learning_rate": 0.0002, "epoch": 2.113979542133463, "step": 2170}, {"loss": 1.6393, "grad_norm": 0.4279228150844574, "learning_rate": 0.0002, "epoch": 2.1237213833414517, "step": 2180}, {"loss": 1.5895, "grad_norm": 0.48209506273269653, "learning_rate": 0.0002, "epoch": 2.13346322454944, "step": 2190}, {"loss": 1.6142, "grad_norm": 0.4071785509586334, "learning_rate": 0.0002, "epoch": 2.143205065757428, "step": 2200}, {"loss": 1.5469, "grad_norm": 0.4629398584365845, "learning_rate": 0.0002, "epoch": 2.1529469069654166, "step": 2210}, {"loss": 1.56, "grad_norm": 0.44390997290611267, "learning_rate": 0.0002, "epoch": 2.1626887481734047, "step": 2220}, {"loss": 1.5395, "grad_norm": 0.46886971592903137, "learning_rate": 0.0002, "epoch": 2.172430589381393, "step": 2230}, {"loss": 1.6108, "grad_norm": 0.43745434284210205, "learning_rate": 0.0002, "epoch": 2.1821724305893815, "step": 2240}, {"loss": 1.5416, "grad_norm": 0.42737245559692383, "learning_rate": 0.0002, "epoch": 2.1919142717973696, "step": 2250}, {"loss": 1.627, "grad_norm": 0.5028428435325623, "learning_rate": 0.0002, "epoch": 2.201656113005358, "step": 2260}, {"loss": 1.6148, "grad_norm": 0.48987212777137756, "learning_rate": 0.0002, "epoch": 2.2113979542133464, "step": 2270}, {"loss": 1.6107, "grad_norm": 0.48186370730400085, "learning_rate": 0.0002, "epoch": 2.2211397954213346, "step": 2280}, {"loss": 1.6657, "grad_norm": 0.4417429566383362, "learning_rate": 0.0002, "epoch": 2.2308816366293227, "step": 2290}, {"loss": 1.595, "grad_norm": 0.4757710099220276, "learning_rate": 0.0002, "epoch": 2.2406234778373113, "step": 2300}, {"loss": 1.591, "grad_norm": 0.44449448585510254, "learning_rate": 0.0002, "epoch": 2.2503653190452995, "step": 2310}, {"loss": 1.5742, "grad_norm": 0.5070863962173462, "learning_rate": 0.0002, "epoch": 2.260107160253288, "step": 2320}, {"loss": 1.5831, "grad_norm": 0.4967133700847626, "learning_rate": 0.0002, "epoch": 2.269849001461276, "step": 2330}, {"loss": 1.5857, "grad_norm": 0.5110220909118652, "learning_rate": 0.0002, "epoch": 2.2795908426692644, "step": 2340}, {"loss": 1.6266, "grad_norm": 0.47984135150909424, "learning_rate": 0.0002, "epoch": 2.289332683877253, "step": 2350}, {"loss": 1.5927, "grad_norm": 0.5005794763565063, "learning_rate": 0.0002, "epoch": 2.299074525085241, "step": 2360}, {"loss": 1.6131, "grad_norm": 0.4991425573825836, "learning_rate": 0.0002, "epoch": 2.3088163662932293, "step": 2370}, {"loss": 1.5386, "grad_norm": 0.4948616623878479, "learning_rate": 0.0002, "epoch": 2.318558207501218, "step": 2380}, {"loss": 1.5769, "grad_norm": 0.4533160328865051, "learning_rate": 0.0002, "epoch": 2.328300048709206, "step": 2390}, {"loss": 1.5856, "grad_norm": 0.5871071219444275, "learning_rate": 0.0002, "epoch": 2.338041889917194, "step": 2400}, {"loss": 1.591, "grad_norm": 0.5048075914382935, "learning_rate": 0.0002, "epoch": 2.347783731125183, "step": 2410}, {"loss": 1.6165, "grad_norm": 0.4973750412464142, "learning_rate": 0.0002, "epoch": 2.357525572333171, "step": 2420}, {"loss": 1.589, "grad_norm": 0.48294538259506226, "learning_rate": 0.0002, "epoch": 2.367267413541159, "step": 2430}, {"loss": 1.5782, "grad_norm": 0.7180454134941101, "learning_rate": 0.0002, "epoch": 2.3770092547491477, "step": 2440}, {"loss": 1.56, "grad_norm": 0.4627632796764374, "learning_rate": 0.0002, "epoch": 2.386751095957136, "step": 2450}, {"loss": 1.586, "grad_norm": 0.4834378957748413, "learning_rate": 0.0002, "epoch": 2.396492937165124, "step": 2460}, {"loss": 1.6145, "grad_norm": 0.5173670649528503, "learning_rate": 0.0002, "epoch": 2.4062347783731126, "step": 2470}, {"loss": 1.5464, "grad_norm": 0.49652737379074097, "learning_rate": 0.0002, "epoch": 2.4159766195811008, "step": 2480}, {"loss": 1.6977, "grad_norm": 0.47052669525146484, "learning_rate": 0.0002, "epoch": 2.4257184607890894, "step": 2490}, {"loss": 1.6215, "grad_norm": 0.5188006162643433, "learning_rate": 0.0002, "epoch": 2.4354603019970775, "step": 2500}, {"loss": 1.5512, "grad_norm": 0.5010119676589966, "learning_rate": 0.0002, "epoch": 2.4452021432050657, "step": 2510}, {"loss": 1.6403, "grad_norm": 0.4765235483646393, "learning_rate": 0.0002, "epoch": 2.4549439844130543, "step": 2520}, {"loss": 1.5907, "grad_norm": 0.5292699337005615, "learning_rate": 0.0002, "epoch": 2.4646858256210424, "step": 2530}, {"loss": 1.5866, "grad_norm": 0.48555099964141846, "learning_rate": 0.0002, "epoch": 2.4744276668290306, "step": 2540}, {"loss": 1.5361, "grad_norm": 0.4764043092727661, "learning_rate": 0.0002, "epoch": 2.484169508037019, "step": 2550}, {"loss": 1.6545, "grad_norm": 0.47839659452438354, "learning_rate": 0.0002, "epoch": 2.4939113492450073, "step": 2560}, {"loss": 1.6269, "grad_norm": 0.4514436721801758, "learning_rate": 0.0002, "epoch": 2.5036531904529955, "step": 2570}, {"loss": 1.6389, "grad_norm": 0.5681955218315125, "learning_rate": 0.0002, "epoch": 2.513395031660984, "step": 2580}, {"loss": 1.5976, "grad_norm": 0.49655985832214355, "learning_rate": 0.0002, "epoch": 2.5231368728689723, "step": 2590}, {"loss": 1.6052, "grad_norm": 0.5077657103538513, "learning_rate": 0.0002, "epoch": 2.5328787140769604, "step": 2600}, {"loss": 1.5658, "grad_norm": 0.5643279552459717, "learning_rate": 0.0002, "epoch": 2.542620555284949, "step": 2610}, {"loss": 1.5455, "grad_norm": 0.4715031087398529, "learning_rate": 0.0002, "epoch": 2.552362396492937, "step": 2620}, {"loss": 1.5907, "grad_norm": 0.528400719165802, "learning_rate": 0.0002, "epoch": 2.5621042377009253, "step": 2630}, {"loss": 1.5452, "grad_norm": 0.49469611048698425, "learning_rate": 0.0002, "epoch": 2.571846078908914, "step": 2640}, {"loss": 1.5904, "grad_norm": 0.4567806124687195, "learning_rate": 0.0002, "epoch": 2.581587920116902, "step": 2650}, {"loss": 1.6242, "grad_norm": 0.5357107520103455, "learning_rate": 0.0002, "epoch": 2.5913297613248902, "step": 2660}, {"loss": 1.6078, "grad_norm": 0.46977677941322327, "learning_rate": 0.0002, "epoch": 2.601071602532879, "step": 2670}, {"loss": 1.6994, "grad_norm": 0.6626771092414856, "learning_rate": 0.0002, "epoch": 2.610813443740867, "step": 2680}, {"loss": 1.5888, "grad_norm": 0.4587472081184387, "learning_rate": 0.0002, "epoch": 2.620555284948855, "step": 2690}, {"loss": 1.6002, "grad_norm": 0.4816797077655792, "learning_rate": 0.0002, "epoch": 2.6302971261568437, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.4856809675693512, "learning_rate": 0.0002, "epoch": 2.640038967364832, "step": 2710}, {"loss": 1.563, "grad_norm": 0.46010780334472656, "learning_rate": 0.0002, "epoch": 2.64978080857282, "step": 2720}, {"loss": 1.5374, "grad_norm": 0.4637954533100128, "learning_rate": 0.0002, "epoch": 2.6595226497808087, "step": 2730}, {"loss": 1.6493, "grad_norm": 0.5954997539520264, "learning_rate": 0.0002, "epoch": 2.669264490988797, "step": 2740}, {"loss": 1.5795, "grad_norm": 0.5071861743927002, "learning_rate": 0.0002, "epoch": 2.679006332196785, "step": 2750}, {"loss": 1.573, "grad_norm": 0.5415477156639099, "learning_rate": 0.0002, "epoch": 2.6887481734047736, "step": 2760}, {"loss": 1.5476, "grad_norm": 0.5618549585342407, "learning_rate": 0.0002, "epoch": 2.6984900146127617, "step": 2770}, {"loss": 1.608, "grad_norm": 0.49338817596435547, "learning_rate": 0.0002, "epoch": 2.70823185582075, "step": 2780}, {"loss": 1.6529, "grad_norm": 0.5149586796760559, "learning_rate": 0.0002, "epoch": 2.7179736970287385, "step": 2790}, {"loss": 1.6279, "grad_norm": 0.6247242093086243, "learning_rate": 0.0002, "epoch": 2.7277155382367266, "step": 2800}, {"loss": 1.4655, "grad_norm": 0.4749542474746704, "learning_rate": 0.0002, "epoch": 2.737457379444715, "step": 2810}, {"loss": 1.5984, "grad_norm": 0.4979191720485687, "learning_rate": 0.0002, "epoch": 2.7471992206527034, "step": 2820}, {"loss": 1.6377, "grad_norm": 0.4885074198246002, "learning_rate": 0.0002, "epoch": 2.7569410618606915, "step": 2830}, {"loss": 1.6529, "grad_norm": 0.5047747492790222, "learning_rate": 0.0002, "epoch": 2.76668290306868, "step": 2840}, {"loss": 1.6574, "grad_norm": 0.5280140042304993, "learning_rate": 0.0002, "epoch": 2.7764247442766683, "step": 2850}, {"loss": 1.5639, "grad_norm": 0.477668434381485, "learning_rate": 0.0002, "epoch": 2.7861665854846565, "step": 2860}, {"loss": 1.5923, "grad_norm": 0.4816327393054962, "learning_rate": 0.0002, "epoch": 2.795908426692645, "step": 2870}, {"loss": 1.6377, "grad_norm": 0.523259162902832, "learning_rate": 0.0002, "epoch": 2.805650267900633, "step": 2880}, {"loss": 1.5779, "grad_norm": 0.5045270919799805, "learning_rate": 0.0002, "epoch": 2.8153921091086214, "step": 2890}, {"loss": 1.5915, "grad_norm": 0.47986042499542236, "learning_rate": 0.0002, "epoch": 2.82513395031661, "step": 2900}, {"loss": 1.5997, "grad_norm": 0.4858797490596771, "learning_rate": 0.0002, "epoch": 2.834875791524598, "step": 2910}, {"loss": 1.5664, "grad_norm": 0.5261512398719788, "learning_rate": 0.0002, "epoch": 2.8446176327325867, "step": 2920}, {"loss": 1.5775, "grad_norm": 0.630550742149353, "learning_rate": 0.0002, "epoch": 2.854359473940575, "step": 2930}, {"loss": 1.5889, "grad_norm": 0.49119752645492554, "learning_rate": 0.0002, "epoch": 2.864101315148563, "step": 2940}, {"loss": 1.6033, "grad_norm": 0.4779070317745209, "learning_rate": 0.0002, "epoch": 2.8738431563565516, "step": 2950}, {"loss": 1.6353, "grad_norm": 0.5059782266616821, "learning_rate": 0.0002, "epoch": 2.88358499756454, "step": 2960}, {"loss": 1.6403, "grad_norm": 0.5466655492782593, "learning_rate": 0.0002, "epoch": 2.893326838772528, "step": 2970}, {"loss": 1.619, "grad_norm": 0.4865640103816986, "learning_rate": 0.0002, "epoch": 2.9030686799805165, "step": 2980}, {"loss": 1.5712, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.9128105211885047, "step": 2990}, {"loss": 1.6216, "grad_norm": 0.9112305641174316, "learning_rate": 0.0002, "epoch": 2.922552362396493, "step": 3000}, {"loss": 1.6493, "grad_norm": 0.3938814103603363, "learning_rate": 0.0002, "epoch": 2.9322942036044815, "step": 3010}, {"loss": 1.5446, "grad_norm": 0.5500800609588623, "learning_rate": 0.0002, "epoch": 2.9420360448124696, "step": 3020}, {"loss": 1.5626, "grad_norm": 0.5346390604972839, "learning_rate": 0.0002, "epoch": 2.9517778860204578, "step": 3030}, {"loss": 1.5783, "grad_norm": 0.5245014429092407, "learning_rate": 0.0002, "epoch": 2.9615197272284464, "step": 3040}, {"loss": 1.5549, "grad_norm": 0.4906884431838989, "learning_rate": 0.0002, "epoch": 2.9712615684364345, "step": 3050}, {"loss": 1.5796, "grad_norm": 0.47086769342422485, "learning_rate": 0.0002, "epoch": 2.9810034096444227, "step": 3060}, {"loss": 1.5917, "grad_norm": 0.5290229320526123, "learning_rate": 0.0002, "epoch": 2.9907452508524113, "step": 3070}, {"eval_loss": 1.8463934659957886, "eval_runtime": 56.2401, "eval_samples_per_second": 9.015, "eval_steps_per_second": 1.138, "epoch": 2.9995129079396006, "step": 3079}, {"loss": 1.5744, "grad_norm": 0.49992576241493225, "learning_rate": 0.0002, "epoch": 3.0004870920603994, "step": 3080}, {"loss": 1.4125, "grad_norm": 0.8242783546447754, "learning_rate": 0.0002, "epoch": 3.0102289332683876, "step": 3090}, {"loss": 1.394, "grad_norm": 0.6330569386482239, "learning_rate": 0.0002, "epoch": 3.019970774476376, "step": 3100}, {"loss": 1.4942, "grad_norm": 0.566097617149353, "learning_rate": 0.0002, "epoch": 3.0297126156843643, "step": 3110}, {"loss": 1.4365, "grad_norm": 0.6337586045265198, "learning_rate": 0.0002, "epoch": 3.0394544568923525, "step": 3120}, {"loss": 1.3916, "grad_norm": 0.7339403033256531, "learning_rate": 0.0002, "epoch": 3.049196298100341, "step": 3130}, {"loss": 1.4617, "grad_norm": 0.7187346816062927, "learning_rate": 0.0002, "epoch": 3.0589381393083293, "step": 3140}, {"loss": 1.3453, "grad_norm": 0.7116255760192871, "learning_rate": 0.0002, "epoch": 3.0686799805163174, "step": 3150}, {"loss": 1.4452, "grad_norm": 0.6493807435035706, "learning_rate": 0.0002, "epoch": 3.078421821724306, "step": 3160}, {"loss": 1.351, "grad_norm": 0.6777266263961792, "learning_rate": 0.0002, "epoch": 3.088163662932294, "step": 3170}, {"loss": 1.4362, "grad_norm": 0.6342006325721741, "learning_rate": 0.0002, "epoch": 3.0979055041402823, "step": 3180}, {"loss": 1.4748, "grad_norm": 0.6608964204788208, "learning_rate": 0.0002, "epoch": 3.107647345348271, "step": 3190}, {"loss": 1.375, "grad_norm": 0.7230247259140015, "learning_rate": 0.0002, "epoch": 3.117389186556259, "step": 3200}, {"loss": 1.4049, "grad_norm": 0.650368332862854, "learning_rate": 0.0002, "epoch": 3.1271310277642472, "step": 3210}, {"loss": 1.409, "grad_norm": 0.7319342494010925, "learning_rate": 0.0002, "epoch": 3.136872868972236, "step": 3220}, {"loss": 1.3872, "grad_norm": 0.7159963846206665, "learning_rate": 0.0002, "epoch": 3.146614710180224, "step": 3230}, {"loss": 1.5076, "grad_norm": 0.8905230164527893, "learning_rate": 0.0002, "epoch": 3.156356551388212, "step": 3240}, {"loss": 1.3161, "grad_norm": 0.6920804381370544, "learning_rate": 0.0002, "epoch": 3.1660983925962007, "step": 3250}, {"loss": 1.3786, "grad_norm": 0.6782063841819763, "learning_rate": 0.0002, "epoch": 3.175840233804189, "step": 3260}, {"loss": 1.5153, "grad_norm": 0.735325276851654, "learning_rate": 0.0002, "epoch": 3.1855820750121775, "step": 3270}, {"loss": 1.4027, "grad_norm": 0.6657978296279907, "learning_rate": 0.0002, "epoch": 3.1953239162201656, "step": 3280}, {"loss": 1.3456, "grad_norm": 0.771315336227417, "learning_rate": 0.0002, "epoch": 3.205065757428154, "step": 3290}, {"loss": 1.3236, "grad_norm": 0.6492983102798462, "learning_rate": 0.0002, "epoch": 3.2148075986361424, "step": 3300}, {"loss": 1.4125, "grad_norm": 0.7513770461082458, "learning_rate": 0.0002, "epoch": 3.2245494398441306, "step": 3310}, {"loss": 1.4032, "grad_norm": 0.7091423869132996, "learning_rate": 0.0002, "epoch": 3.2342912810521187, "step": 3320}, {"loss": 1.4585, "grad_norm": 0.6663975119590759, "learning_rate": 0.0002, "epoch": 3.2440331222601073, "step": 3330}, {"loss": 1.3968, "grad_norm": 0.6813122034072876, "learning_rate": 0.0002, "epoch": 3.2537749634680955, "step": 3340}, {"loss": 1.3681, "grad_norm": 0.6602569818496704, "learning_rate": 0.0002, "epoch": 3.2635168046760836, "step": 3350}, {"loss": 1.4533, "grad_norm": 0.718270480632782, "learning_rate": 0.0002, "epoch": 3.2732586458840722, "step": 3360}, {"loss": 1.4076, "grad_norm": 0.6884173154830933, "learning_rate": 0.0002, "epoch": 3.2830004870920604, "step": 3370}, {"loss": 1.4144, "grad_norm": 0.7039775848388672, "learning_rate": 0.0002, "epoch": 3.2927423283000485, "step": 3380}, {"loss": 1.5077, "grad_norm": 0.7444299459457397, "learning_rate": 0.0002, "epoch": 3.302484169508037, "step": 3390}, {"loss": 1.4255, "grad_norm": 0.7187064290046692, "learning_rate": 0.0002, "epoch": 3.3122260107160253, "step": 3400}, {"loss": 1.3684, "grad_norm": 0.599396288394928, "learning_rate": 0.0002, "epoch": 3.3219678519240134, "step": 3410}, {"loss": 1.4819, "grad_norm": 0.7670390009880066, "learning_rate": 0.0002, "epoch": 3.331709693132002, "step": 3420}, {"loss": 1.4411, "grad_norm": 0.6654478311538696, "learning_rate": 0.0002, "epoch": 3.34145153433999, "step": 3430}, {"loss": 1.4257, "grad_norm": 0.6644385457038879, "learning_rate": 0.0002, "epoch": 3.351193375547979, "step": 3440}, {"loss": 1.4508, "grad_norm": 0.6974098086357117, "learning_rate": 0.0002, "epoch": 3.360935216755967, "step": 3450}, {"loss": 1.3807, "grad_norm": 0.7350399494171143, "learning_rate": 0.0002, "epoch": 3.370677057963955, "step": 3460}, {"loss": 1.4176, "grad_norm": 0.714721143245697, "learning_rate": 0.0002, "epoch": 3.3804188991719437, "step": 3470}, {"loss": 1.4325, "grad_norm": 0.7006027698516846, "learning_rate": 0.0002, "epoch": 3.390160740379932, "step": 3480}, {"loss": 1.4888, "grad_norm": 0.6767925024032593, "learning_rate": 0.0002, "epoch": 3.39990258158792, "step": 3490}, {"loss": 1.4116, "grad_norm": 0.6721355319023132, "learning_rate": 0.0002, "epoch": 3.4096444227959086, "step": 3500}, {"loss": 1.443, "grad_norm": 0.6845725178718567, "learning_rate": 0.0002, "epoch": 3.419386264003897, "step": 3510}, {"loss": 1.4832, "grad_norm": 0.6882196664810181, "learning_rate": 0.0002, "epoch": 3.429128105211885, "step": 3520}, {"loss": 1.4962, "grad_norm": 0.7663240432739258, "learning_rate": 0.0002, "epoch": 3.4388699464198735, "step": 3530}, {"loss": 1.4644, "grad_norm": 0.6304219365119934, "learning_rate": 0.0002, "epoch": 3.4486117876278617, "step": 3540}, {"loss": 1.4918, "grad_norm": 0.668678879737854, "learning_rate": 0.0002, "epoch": 3.45835362883585, "step": 3550}, {"loss": 1.4874, "grad_norm": 0.7526912093162537, "learning_rate": 0.0002, "epoch": 3.4680954700438384, "step": 3560}, {"loss": 1.4249, "grad_norm": 1.089495301246643, "learning_rate": 0.0002, "epoch": 3.4778373112518266, "step": 3570}, {"loss": 1.3871, "grad_norm": 0.7282902002334595, "learning_rate": 0.0002, "epoch": 3.4875791524598148, "step": 3580}, {"loss": 1.5077, "grad_norm": 0.6540156602859497, "learning_rate": 0.0002, "epoch": 3.4973209936678034, "step": 3590}, {"loss": 1.4367, "grad_norm": 0.6449568867683411, "learning_rate": 0.0002, "epoch": 3.5070628348757915, "step": 3600}, {"loss": 1.4532, "grad_norm": 0.7262216210365295, "learning_rate": 0.0002, "epoch": 3.5168046760837797, "step": 3610}, {"loss": 1.4374, "grad_norm": 0.6048615574836731, "learning_rate": 0.0002, "epoch": 3.5265465172917683, "step": 3620}, {"loss": 1.3877, "grad_norm": 0.6780537366867065, "learning_rate": 0.0002, "epoch": 3.5362883584997564, "step": 3630}, {"loss": 1.422, "grad_norm": 0.6851925253868103, "learning_rate": 0.0002, "epoch": 3.5460301997077446, "step": 3640}, {"loss": 1.3425, "grad_norm": 0.6530634164810181, "learning_rate": 0.0002, "epoch": 3.555772040915733, "step": 3650}, {"loss": 1.4879, "grad_norm": 0.7193992733955383, "learning_rate": 0.0002, "epoch": 3.5655138821237213, "step": 3660}, {"loss": 1.4555, "grad_norm": 0.767496645450592, "learning_rate": 0.0002, "epoch": 3.5752557233317095, "step": 3670}, {"loss": 1.4824, "grad_norm": 0.6912919282913208, "learning_rate": 0.0002, "epoch": 3.584997564539698, "step": 3680}, {"loss": 1.4497, "grad_norm": 0.7383436560630798, "learning_rate": 0.0002, "epoch": 3.5947394057476862, "step": 3690}, {"loss": 1.4822, "grad_norm": 0.6746662855148315, "learning_rate": 0.0002, "epoch": 3.6044812469556744, "step": 3700}, {"loss": 1.4904, "grad_norm": 0.6885138750076294, "learning_rate": 0.0002, "epoch": 3.614223088163663, "step": 3710}, {"loss": 1.4044, "grad_norm": 0.6694392561912537, "learning_rate": 0.0002, "epoch": 3.623964929371651, "step": 3720}, {"loss": 1.3719, "grad_norm": 0.812358021736145, "learning_rate": 0.0002, "epoch": 3.6337067705796393, "step": 3730}, {"loss": 1.4603, "grad_norm": 0.7267130017280579, "learning_rate": 0.0002, "epoch": 3.643448611787628, "step": 3740}, {"loss": 1.4574, "grad_norm": 0.6958749294281006, "learning_rate": 0.0002, "epoch": 3.653190452995616, "step": 3750}, {"loss": 1.4346, "grad_norm": 0.6805673241615295, "learning_rate": 0.0002, "epoch": 3.6629322942036042, "step": 3760}, {"loss": 1.4338, "grad_norm": 0.7184410095214844, "learning_rate": 0.0002, "epoch": 3.672674135411593, "step": 3770}, {"loss": 1.3935, "grad_norm": 0.7716330289840698, "learning_rate": 0.0002, "epoch": 3.682415976619581, "step": 3780}, {"loss": 1.384, "grad_norm": 0.6675831079483032, "learning_rate": 0.0002, "epoch": 3.6921578178275696, "step": 3790}, {"loss": 1.401, "grad_norm": 0.6480095386505127, "learning_rate": 0.0002, "epoch": 3.7018996590355577, "step": 3800}, {"loss": 1.5303, "grad_norm": 0.6559418439865112, "learning_rate": 0.0002, "epoch": 3.711641500243546, "step": 3810}, {"loss": 1.4341, "grad_norm": 0.6596545577049255, "learning_rate": 0.0002, "epoch": 3.7213833414515345, "step": 3820}, {"loss": 1.4508, "grad_norm": 0.7172950506210327, "learning_rate": 0.0002, "epoch": 3.7311251826595226, "step": 3830}, {"loss": 1.446, "grad_norm": 0.796148419380188, "learning_rate": 0.0002, "epoch": 3.740867023867511, "step": 3840}, {"loss": 1.4992, "grad_norm": 0.6600322723388672, "learning_rate": 0.0002, "epoch": 3.7506088650754994, "step": 3850}, {"loss": 1.4201, "grad_norm": 0.6776387691497803, "learning_rate": 0.0002, "epoch": 3.7603507062834876, "step": 3860}, {"loss": 1.3893, "grad_norm": 0.7768304347991943, "learning_rate": 0.0002, "epoch": 3.770092547491476, "step": 3870}, {"loss": 1.4886, "grad_norm": 1.0579794645309448, "learning_rate": 0.0002, "epoch": 3.7798343886994643, "step": 3880}, {"loss": 1.4556, "grad_norm": 0.6757252812385559, "learning_rate": 0.0002, "epoch": 3.7895762299074525, "step": 3890}, {"loss": 1.4647, "grad_norm": 0.6706996560096741, "learning_rate": 0.0002, "epoch": 3.799318071115441, "step": 3900}, {"loss": 1.4104, "grad_norm": 0.7026948928833008, "learning_rate": 0.0002, "epoch": 3.809059912323429, "step": 3910}, {"loss": 1.5487, "grad_norm": 0.6437768340110779, "learning_rate": 0.0002, "epoch": 3.8188017535314174, "step": 3920}, {"loss": 1.4678, "grad_norm": 0.7015706300735474, "learning_rate": 0.0002, "epoch": 3.828543594739406, "step": 3930}, {"loss": 1.4891, "grad_norm": 0.7049482464790344, "learning_rate": 0.0002, "epoch": 3.838285435947394, "step": 3940}, {"loss": 1.4208, "grad_norm": 0.6533724665641785, "learning_rate": 0.0002, "epoch": 3.8480272771553823, "step": 3950}, {"loss": 1.4435, "grad_norm": 0.7312499284744263, "learning_rate": 0.0002, "epoch": 3.857769118363371, "step": 3960}, {"loss": 1.3886, "grad_norm": 0.6858801245689392, "learning_rate": 0.0002, "epoch": 3.867510959571359, "step": 3970}, {"loss": 1.4423, "grad_norm": 0.770423173904419, "learning_rate": 0.0002, "epoch": 3.877252800779347, "step": 3980}, {"loss": 1.5029, "grad_norm": 0.6987539529800415, "learning_rate": 0.0002, "epoch": 3.886994641987336, "step": 3990}, {"loss": 1.4791, "grad_norm": 0.7072722315788269, "learning_rate": 0.0002, "epoch": 3.896736483195324, "step": 4000}, {"loss": 1.528, "grad_norm": 0.6492931842803955, "learning_rate": 0.0002, "epoch": 3.906478324403312, "step": 4010}, {"loss": 1.3824, "grad_norm": 0.7716232538223267, "learning_rate": 0.0002, "epoch": 3.9162201656113007, "step": 4020}, {"loss": 1.4758, "grad_norm": 0.722949743270874, "learning_rate": 0.0002, "epoch": 3.925962006819289, "step": 4030}, {"loss": 1.3914, "grad_norm": 0.7434365749359131, "learning_rate": 0.0002, "epoch": 3.935703848027277, "step": 4040}, {"loss": 1.4763, "grad_norm": 0.6691509485244751, "learning_rate": 0.0002, "epoch": 3.9454456892352656, "step": 4050}, {"loss": 1.4555, "grad_norm": 0.6850284337997437, "learning_rate": 0.0002, "epoch": 3.9551875304432538, "step": 4060}, {"loss": 1.5275, "grad_norm": 0.6954452991485596, "learning_rate": 0.0002, "epoch": 3.964929371651242, "step": 4070}, {"loss": 1.417, "grad_norm": 0.9316364526748657, "learning_rate": 0.0002, "epoch": 3.9746712128592305, "step": 4080}, {"loss": 1.4532, "grad_norm": 0.6908289194107056, "learning_rate": 0.0002, "epoch": 3.9844130540672187, "step": 4090}, {"loss": 1.4404, "grad_norm": 0.666782021522522, "learning_rate": 0.0002, "epoch": 3.994154895275207, "step": 4100}]} +{"epoch": 4.9995129079396, "step": 5132, "epoch_duration": 1650.0333547592163, "total_accumulated_duration": 9224.923346281052, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0782, "grad_norm": 0.6537588834762573, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4644, "grad_norm": 0.5270306468009949, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2589, "grad_norm": 0.6826501488685608, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0141, "grad_norm": 0.5061377286911011, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9458, "grad_norm": 0.4300410747528076, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.983, "grad_norm": 0.5063319802284241, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9799, "grad_norm": 0.49310117959976196, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9277, "grad_norm": 0.4676004648208618, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9147, "grad_norm": 0.41647228598594666, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8894, "grad_norm": 0.40217313170433044, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9099, "grad_norm": 0.4123637080192566, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8471, "grad_norm": 0.37414297461509705, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3179326355457306, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.847, "grad_norm": 0.3548192083835602, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7919, "grad_norm": 0.3273540139198303, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8496, "grad_norm": 0.36500975489616394, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8473, "grad_norm": 0.4106619656085968, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.841, "grad_norm": 0.41361644864082336, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8879, "grad_norm": 0.3608580231666565, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7717, "grad_norm": 0.4291760325431824, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8437, "grad_norm": 0.344184011220932, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8779, "grad_norm": 0.3834705650806427, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7533, "grad_norm": 0.3738210201263428, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.824, "grad_norm": 0.4306780695915222, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8519, "grad_norm": 0.5066465139389038, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7402, "grad_norm": 0.34227681159973145, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8614, "grad_norm": 0.3346865475177765, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8502, "grad_norm": 0.3639362156391144, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.33223700523376465, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.35176315903663635, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7635, "grad_norm": 0.3581472635269165, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8262, "grad_norm": 0.35943421721458435, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8167, "grad_norm": 0.322051078081131, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8221, "grad_norm": 0.33904823660850525, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9159, "grad_norm": 0.39162731170654297, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7966, "grad_norm": 0.330624520778656, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8646, "grad_norm": 0.3793248236179352, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8038, "grad_norm": 0.3347395658493042, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9244, "grad_norm": 0.30527254939079285, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7982, "grad_norm": 0.3081390857696533, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8968, "grad_norm": 0.3742620050907135, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8095, "grad_norm": 0.4080568253993988, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8555, "grad_norm": 0.38034746050834656, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8494, "grad_norm": 0.34893402457237244, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8481, "grad_norm": 0.33285608887672424, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8466, "grad_norm": 0.4110095798969269, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7906, "grad_norm": 0.3658817410469055, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7589, "grad_norm": 0.31350770592689514, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7839, "grad_norm": 0.38827991485595703, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8224, "grad_norm": 0.3792393207550049, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8028, "grad_norm": 0.3004095256328583, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6899, "grad_norm": 0.3200063407421112, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8205, "grad_norm": 0.3206128478050232, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7725, "grad_norm": 0.30258631706237793, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7791, "grad_norm": 0.28210392594337463, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.34854066371917725, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8331, "grad_norm": 0.31689873337745667, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8311, "grad_norm": 0.31253790855407715, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7035, "grad_norm": 0.3229721188545227, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.766, "grad_norm": 0.3723772466182709, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8357, "grad_norm": 0.345798522233963, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7266, "grad_norm": 0.3440598249435425, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8307, "grad_norm": 0.3406416177749634, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8103, "grad_norm": 0.3218357264995575, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7457, "grad_norm": 0.45319172739982605, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8674, "grad_norm": 0.2787110507488251, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8426, "grad_norm": 0.3064707815647125, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.846, "grad_norm": 0.2940629720687866, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7865, "grad_norm": 0.31695225834846497, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.29589611291885376, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7828, "grad_norm": 0.3062121570110321, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8108, "grad_norm": 0.3315656781196594, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8056, "grad_norm": 0.30353930592536926, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7682, "grad_norm": 0.28360483050346375, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3362562656402588, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7607, "grad_norm": 0.40434667468070984, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8374, "grad_norm": 0.2930425703525543, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8216, "grad_norm": 0.30177003145217896, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8081, "grad_norm": 0.2784474790096283, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.35849854350090027, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7896, "grad_norm": 0.27329114079475403, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7633, "grad_norm": 0.33331671357154846, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7508, "grad_norm": 0.28727295994758606, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31391268968582153, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.793, "grad_norm": 0.3303709030151367, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7479, "grad_norm": 0.33772537112236023, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.32876333594322205, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7832, "grad_norm": 0.28444716334342957, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7077, "grad_norm": 0.3070019483566284, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7699, "grad_norm": 0.29484760761260986, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7211, "grad_norm": 0.32373034954071045, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7799, "grad_norm": 0.3229396939277649, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8226, "grad_norm": 0.33151453733444214, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8339, "grad_norm": 0.32037460803985596, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.822, "grad_norm": 0.31283533573150635, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8233, "grad_norm": 0.27984118461608887, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7755, "grad_norm": 0.316500186920166, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8032, "grad_norm": 0.33708682656288147, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8863, "grad_norm": 0.31026017665863037, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8458, "grad_norm": 0.30874672532081604, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7975, "grad_norm": 0.3257741630077362, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7936, "grad_norm": 0.2865653932094574, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}, {"eval_loss": 1.8103164434432983, "eval_runtime": 56.3917, "eval_samples_per_second": 8.991, "eval_steps_per_second": 1.135, "epoch": 0.9995129079396006, "step": 1026}, {"loss": 1.7013, "grad_norm": 0.2860608398914337, "learning_rate": 0.0002, "epoch": 1.003409644422796, "step": 1030}, {"loss": 1.7521, "grad_norm": 0.3156210780143738, "learning_rate": 0.0002, "epoch": 1.0131514856307842, "step": 1040}, {"loss": 1.6182, "grad_norm": 0.26126575469970703, "learning_rate": 0.0002, "epoch": 1.0228933268387725, "step": 1050}, {"loss": 1.7546, "grad_norm": 0.3019633889198303, "learning_rate": 0.0002, "epoch": 1.032635168046761, "step": 1060}, {"loss": 1.7096, "grad_norm": 0.534140944480896, "learning_rate": 0.0002, "epoch": 1.042377009254749, "step": 1070}, {"loss": 1.664, "grad_norm": 0.311872661113739, "learning_rate": 0.0002, "epoch": 1.0521188504627375, "step": 1080}, {"loss": 1.7056, "grad_norm": 0.3276001513004303, "learning_rate": 0.0002, "epoch": 1.0618606916707258, "step": 1090}, {"loss": 1.7943, "grad_norm": 0.35227468609809875, "learning_rate": 0.0002, "epoch": 1.071602532878714, "step": 1100}, {"loss": 1.7079, "grad_norm": 0.3597564995288849, "learning_rate": 0.0002, "epoch": 1.0813443740867024, "step": 1110}, {"loss": 1.7635, "grad_norm": 0.3547225296497345, "learning_rate": 0.0002, "epoch": 1.0910862152946907, "step": 1120}, {"loss": 1.6859, "grad_norm": 0.3399786353111267, "learning_rate": 0.0002, "epoch": 1.100828056502679, "step": 1130}, {"loss": 1.7478, "grad_norm": 0.3309086263179779, "learning_rate": 0.0002, "epoch": 1.1105698977106673, "step": 1140}, {"loss": 1.7372, "grad_norm": 0.39330706000328064, "learning_rate": 0.0002, "epoch": 1.1203117389186557, "step": 1150}, {"loss": 1.6236, "grad_norm": 0.3628021776676178, "learning_rate": 0.0002, "epoch": 1.130053580126644, "step": 1160}, {"loss": 1.8022, "grad_norm": 0.32995012402534485, "learning_rate": 0.0002, "epoch": 1.1397954213346322, "step": 1170}, {"loss": 1.636, "grad_norm": 0.36292821168899536, "learning_rate": 0.0002, "epoch": 1.1495372625426206, "step": 1180}, {"loss": 1.7156, "grad_norm": 0.3470092713832855, "learning_rate": 0.0002, "epoch": 1.159279103750609, "step": 1190}, {"loss": 1.7001, "grad_norm": 0.3496156334877014, "learning_rate": 0.0002, "epoch": 1.169020944958597, "step": 1200}, {"loss": 1.716, "grad_norm": 0.3442084789276123, "learning_rate": 0.0002, "epoch": 1.1787627861665855, "step": 1210}, {"loss": 1.7763, "grad_norm": 0.34983909130096436, "learning_rate": 0.0002, "epoch": 1.1885046273745739, "step": 1220}, {"loss": 1.6964, "grad_norm": 0.36505937576293945, "learning_rate": 0.0002, "epoch": 1.198246468582562, "step": 1230}, {"loss": 1.7382, "grad_norm": 0.31624770164489746, "learning_rate": 0.0002, "epoch": 1.2079883097905504, "step": 1240}, {"loss": 1.7196, "grad_norm": 0.3528020679950714, "learning_rate": 0.0002, "epoch": 1.2177301509985388, "step": 1250}, {"loss": 1.752, "grad_norm": 0.29294025897979736, "learning_rate": 0.0002, "epoch": 1.2274719922065271, "step": 1260}, {"loss": 1.719, "grad_norm": 0.35048434138298035, "learning_rate": 0.0002, "epoch": 1.2372138334145153, "step": 1270}, {"loss": 1.6806, "grad_norm": 0.35224461555480957, "learning_rate": 0.0002, "epoch": 1.2469556746225037, "step": 1280}, {"loss": 1.6836, "grad_norm": 0.4041554629802704, "learning_rate": 0.0002, "epoch": 1.256697515830492, "step": 1290}, {"loss": 1.6999, "grad_norm": 0.3447791039943695, "learning_rate": 0.0002, "epoch": 1.2664393570384802, "step": 1300}, {"loss": 1.7601, "grad_norm": 0.3315333425998688, "learning_rate": 0.0002, "epoch": 1.2761811982464686, "step": 1310}, {"loss": 1.7325, "grad_norm": 0.3587741255760193, "learning_rate": 0.0002, "epoch": 1.285923039454457, "step": 1320}, {"loss": 1.6707, "grad_norm": 0.3704394996166229, "learning_rate": 0.0002, "epoch": 1.2956648806624451, "step": 1330}, {"loss": 1.7374, "grad_norm": 0.38131803274154663, "learning_rate": 0.0002, "epoch": 1.3054067218704335, "step": 1340}, {"loss": 1.5956, "grad_norm": 0.36109617352485657, "learning_rate": 0.0002, "epoch": 1.3151485630784219, "step": 1350}, {"loss": 1.682, "grad_norm": 0.37283554673194885, "learning_rate": 0.0002, "epoch": 1.32489040428641, "step": 1360}, {"loss": 1.7351, "grad_norm": 0.31808891892433167, "learning_rate": 0.0002, "epoch": 1.3346322454943984, "step": 1370}, {"loss": 1.8368, "grad_norm": 0.3370385766029358, "learning_rate": 0.0002, "epoch": 1.3443740867023868, "step": 1380}, {"loss": 1.6922, "grad_norm": 0.3568558394908905, "learning_rate": 0.0002, "epoch": 1.354115927910375, "step": 1390}, {"loss": 1.7301, "grad_norm": 0.3537410497665405, "learning_rate": 0.0002, "epoch": 1.3638577691183633, "step": 1400}, {"loss": 1.6534, "grad_norm": 0.3536544144153595, "learning_rate": 0.0002, "epoch": 1.3735996103263517, "step": 1410}, {"loss": 1.6829, "grad_norm": 0.3772895038127899, "learning_rate": 0.0002, "epoch": 1.38334145153434, "step": 1420}, {"loss": 1.7344, "grad_norm": 0.38079720735549927, "learning_rate": 0.0002, "epoch": 1.3930832927423282, "step": 1430}, {"loss": 1.7121, "grad_norm": 0.3811109662055969, "learning_rate": 0.0002, "epoch": 1.4028251339503166, "step": 1440}, {"loss": 1.6424, "grad_norm": 0.38586318492889404, "learning_rate": 0.0002, "epoch": 1.412566975158305, "step": 1450}, {"loss": 1.7438, "grad_norm": 0.3405744135379791, "learning_rate": 0.0002, "epoch": 1.4223088163662934, "step": 1460}, {"loss": 1.7483, "grad_norm": 0.39527642726898193, "learning_rate": 0.0002, "epoch": 1.4320506575742815, "step": 1470}, {"loss": 1.7927, "grad_norm": 0.4494728744029999, "learning_rate": 0.0002, "epoch": 1.44179249878227, "step": 1480}, {"loss": 1.5993, "grad_norm": 0.34068453311920166, "learning_rate": 0.0002, "epoch": 1.4515343399902583, "step": 1490}, {"loss": 1.73, "grad_norm": 0.36169710755348206, "learning_rate": 0.0002, "epoch": 1.4612761811982464, "step": 1500}, {"loss": 1.7189, "grad_norm": 0.31519418954849243, "learning_rate": 0.0002, "epoch": 1.4710180224062348, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.35117292404174805, "learning_rate": 0.0002, "epoch": 1.4807598636142232, "step": 1520}, {"loss": 1.6662, "grad_norm": 0.40951141715049744, "learning_rate": 0.0002, "epoch": 1.4905017048222113, "step": 1530}, {"loss": 1.764, "grad_norm": 0.37542906403541565, "learning_rate": 0.0002, "epoch": 1.5002435460301997, "step": 1540}, {"loss": 1.6563, "grad_norm": 0.35395753383636475, "learning_rate": 0.0002, "epoch": 1.509985387238188, "step": 1550}, {"loss": 1.6517, "grad_norm": 0.35497018694877625, "learning_rate": 0.0002, "epoch": 1.5197272284461762, "step": 1560}, {"loss": 1.697, "grad_norm": 0.3693031072616577, "learning_rate": 0.0002, "epoch": 1.5294690696541646, "step": 1570}, {"loss": 1.7193, "grad_norm": 0.34013301134109497, "learning_rate": 0.0002, "epoch": 1.539210910862153, "step": 1580}, {"loss": 1.7364, "grad_norm": 0.37312784790992737, "learning_rate": 0.0002, "epoch": 1.5489527520701412, "step": 1590}, {"loss": 1.6698, "grad_norm": 0.357496440410614, "learning_rate": 0.0002, "epoch": 1.5586945932781295, "step": 1600}, {"loss": 1.7113, "grad_norm": 0.35192370414733887, "learning_rate": 0.0002, "epoch": 1.568436434486118, "step": 1610}, {"loss": 1.7505, "grad_norm": 0.34144821763038635, "learning_rate": 0.0002, "epoch": 1.578178275694106, "step": 1620}, {"loss": 1.6353, "grad_norm": 0.3320509195327759, "learning_rate": 0.0002, "epoch": 1.5879201169020944, "step": 1630}, {"loss": 1.6781, "grad_norm": 0.34178847074508667, "learning_rate": 0.0002, "epoch": 1.5976619581100828, "step": 1640}, {"loss": 1.7318, "grad_norm": 0.36567580699920654, "learning_rate": 0.0002, "epoch": 1.607403799318071, "step": 1650}, {"loss": 1.6971, "grad_norm": 0.35599812865257263, "learning_rate": 0.0002, "epoch": 1.6171456405260596, "step": 1660}, {"loss": 1.7256, "grad_norm": 0.33765384554862976, "learning_rate": 0.0002, "epoch": 1.6268874817340477, "step": 1670}, {"loss": 1.7396, "grad_norm": 0.33142679929733276, "learning_rate": 0.0002, "epoch": 1.636629322942036, "step": 1680}, {"loss": 1.7143, "grad_norm": 0.6959079504013062, "learning_rate": 0.0002, "epoch": 1.6463711641500245, "step": 1690}, {"loss": 1.7665, "grad_norm": 0.35073819756507874, "learning_rate": 0.0002, "epoch": 1.6561130053580126, "step": 1700}, {"loss": 1.7571, "grad_norm": 0.3461478352546692, "learning_rate": 0.0002, "epoch": 1.665854846566001, "step": 1710}, {"loss": 1.6608, "grad_norm": 0.3697752058506012, "learning_rate": 0.0002, "epoch": 1.6755966877739894, "step": 1720}, {"loss": 1.729, "grad_norm": 0.3755154609680176, "learning_rate": 0.0002, "epoch": 1.6853385289819776, "step": 1730}, {"loss": 1.6618, "grad_norm": 0.33977627754211426, "learning_rate": 0.0002, "epoch": 1.695080370189966, "step": 1740}, {"loss": 1.7207, "grad_norm": 0.4001041650772095, "learning_rate": 0.0002, "epoch": 1.7048222113979543, "step": 1750}, {"loss": 1.6756, "grad_norm": 0.36998286843299866, "learning_rate": 0.0002, "epoch": 1.7145640526059425, "step": 1760}, {"loss": 1.7506, "grad_norm": 0.39944565296173096, "learning_rate": 0.0002, "epoch": 1.7243058938139308, "step": 1770}, {"loss": 1.6725, "grad_norm": 0.4002859890460968, "learning_rate": 0.0002, "epoch": 1.7340477350219192, "step": 1780}, {"loss": 1.768, "grad_norm": 0.33336859941482544, "learning_rate": 0.0002, "epoch": 1.7437895762299074, "step": 1790}, {"loss": 1.7816, "grad_norm": 0.35853952169418335, "learning_rate": 0.0002, "epoch": 1.7535314174378958, "step": 1800}, {"loss": 1.7135, "grad_norm": 0.35876700282096863, "learning_rate": 0.0002, "epoch": 1.7632732586458841, "step": 1810}, {"loss": 1.7327, "grad_norm": 0.3497968912124634, "learning_rate": 0.0002, "epoch": 1.7730150998538723, "step": 1820}, {"loss": 1.7128, "grad_norm": 0.33182016015052795, "learning_rate": 0.0002, "epoch": 1.7827569410618607, "step": 1830}, {"loss": 1.7594, "grad_norm": 0.33359771966934204, "learning_rate": 0.0002, "epoch": 1.792498782269849, "step": 1840}, {"loss": 1.8611, "grad_norm": 0.38070961833000183, "learning_rate": 0.0002, "epoch": 1.8022406234778372, "step": 1850}, {"loss": 1.7576, "grad_norm": 0.34111160039901733, "learning_rate": 0.0002, "epoch": 1.8119824646858256, "step": 1860}, {"loss": 1.7533, "grad_norm": 0.4439302980899811, "learning_rate": 0.0002, "epoch": 1.821724305893814, "step": 1870}, {"loss": 1.7931, "grad_norm": 0.37065210938453674, "learning_rate": 0.0002, "epoch": 1.8314661471018021, "step": 1880}, {"loss": 1.7392, "grad_norm": 0.33630406856536865, "learning_rate": 0.0002, "epoch": 1.8412079883097907, "step": 1890}, {"loss": 1.6369, "grad_norm": 0.334553986787796, "learning_rate": 0.0002, "epoch": 1.8509498295177789, "step": 1900}, {"loss": 1.7271, "grad_norm": 0.3603808879852295, "learning_rate": 0.0002, "epoch": 1.860691670725767, "step": 1910}, {"loss": 1.6777, "grad_norm": 0.4307343363761902, "learning_rate": 0.0002, "epoch": 1.8704335119337556, "step": 1920}, {"loss": 1.75, "grad_norm": 0.455602765083313, "learning_rate": 0.0002, "epoch": 1.8801753531417438, "step": 1930}, {"loss": 1.6898, "grad_norm": 0.35242316126823425, "learning_rate": 0.0002, "epoch": 1.8899171943497322, "step": 1940}, {"loss": 1.7152, "grad_norm": 0.3589116632938385, "learning_rate": 0.0002, "epoch": 1.8996590355577205, "step": 1950}, {"loss": 1.7125, "grad_norm": 0.3540741801261902, "learning_rate": 0.0002, "epoch": 1.9094008767657087, "step": 1960}, {"loss": 1.6873, "grad_norm": 0.3547612428665161, "learning_rate": 0.0002, "epoch": 1.919142717973697, "step": 1970}, {"loss": 1.6995, "grad_norm": 0.3485773503780365, "learning_rate": 0.0002, "epoch": 1.9288845591816854, "step": 1980}, {"loss": 1.7301, "grad_norm": 0.3560304641723633, "learning_rate": 0.0002, "epoch": 1.9386264003896736, "step": 1990}, {"loss": 1.748, "grad_norm": 0.33299335837364197, "learning_rate": 0.0002, "epoch": 1.948368241597662, "step": 2000}, {"loss": 1.7397, "grad_norm": 0.35622233152389526, "learning_rate": 0.0002, "epoch": 1.9581100828056504, "step": 2010}, {"loss": 1.7201, "grad_norm": 0.3681301474571228, "learning_rate": 0.0002, "epoch": 1.9678519240136385, "step": 2020}, {"loss": 1.717, "grad_norm": 0.36158084869384766, "learning_rate": 0.0002, "epoch": 1.9775937652216269, "step": 2030}, {"loss": 1.6332, "grad_norm": 0.32560569047927856, "learning_rate": 0.0002, "epoch": 1.9873356064296153, "step": 2040}, {"loss": 1.6958, "grad_norm": 0.37404149770736694, "learning_rate": 0.0002, "epoch": 1.9970774476376034, "step": 2050}, {"eval_loss": 1.8119343519210815, "eval_runtime": 96.0045, "eval_samples_per_second": 5.281, "eval_steps_per_second": 0.667, "epoch": 2.0, "step": 2053}, {"loss": 1.657, "grad_norm": 0.374188631772995, "learning_rate": 0.0002, "epoch": 2.006819288845592, "step": 2060}, {"loss": 1.5655, "grad_norm": 0.421764075756073, "learning_rate": 0.0002, "epoch": 2.01656113005358, "step": 2070}, {"loss": 1.548, "grad_norm": 0.43841829895973206, "learning_rate": 0.0002, "epoch": 2.0263029712615683, "step": 2080}, {"loss": 1.6326, "grad_norm": 0.42298218607902527, "learning_rate": 0.0002, "epoch": 2.036044812469557, "step": 2090}, {"loss": 1.5883, "grad_norm": 0.43669602274894714, "learning_rate": 0.0002, "epoch": 2.045786653677545, "step": 2100}, {"loss": 1.6143, "grad_norm": 0.4080469012260437, "learning_rate": 0.0002, "epoch": 2.0555284948855332, "step": 2110}, {"loss": 1.5591, "grad_norm": 0.483192503452301, "learning_rate": 0.0002, "epoch": 2.065270336093522, "step": 2120}, {"loss": 1.6492, "grad_norm": 0.44427400827407837, "learning_rate": 0.0002, "epoch": 2.07501217730151, "step": 2130}, {"loss": 1.5845, "grad_norm": 0.48835131525993347, "learning_rate": 0.0002, "epoch": 2.084754018509498, "step": 2140}, {"loss": 1.5617, "grad_norm": 0.42733684182167053, "learning_rate": 0.0002, "epoch": 2.0944958597174868, "step": 2150}, {"loss": 1.5562, "grad_norm": 0.4258694648742676, "learning_rate": 0.0002, "epoch": 2.104237700925475, "step": 2160}, {"loss": 1.517, "grad_norm": 0.5164985656738281, "learning_rate": 0.0002, "epoch": 2.113979542133463, "step": 2170}, {"loss": 1.6393, "grad_norm": 0.4279228150844574, "learning_rate": 0.0002, "epoch": 2.1237213833414517, "step": 2180}, {"loss": 1.5895, "grad_norm": 0.48209506273269653, "learning_rate": 0.0002, "epoch": 2.13346322454944, "step": 2190}, {"loss": 1.6142, "grad_norm": 0.4071785509586334, "learning_rate": 0.0002, "epoch": 2.143205065757428, "step": 2200}, {"loss": 1.5469, "grad_norm": 0.4629398584365845, "learning_rate": 0.0002, "epoch": 2.1529469069654166, "step": 2210}, {"loss": 1.56, "grad_norm": 0.44390997290611267, "learning_rate": 0.0002, "epoch": 2.1626887481734047, "step": 2220}, {"loss": 1.5395, "grad_norm": 0.46886971592903137, "learning_rate": 0.0002, "epoch": 2.172430589381393, "step": 2230}, {"loss": 1.6108, "grad_norm": 0.43745434284210205, "learning_rate": 0.0002, "epoch": 2.1821724305893815, "step": 2240}, {"loss": 1.5416, "grad_norm": 0.42737245559692383, "learning_rate": 0.0002, "epoch": 2.1919142717973696, "step": 2250}, {"loss": 1.627, "grad_norm": 0.5028428435325623, "learning_rate": 0.0002, "epoch": 2.201656113005358, "step": 2260}, {"loss": 1.6148, "grad_norm": 0.48987212777137756, "learning_rate": 0.0002, "epoch": 2.2113979542133464, "step": 2270}, {"loss": 1.6107, "grad_norm": 0.48186370730400085, "learning_rate": 0.0002, "epoch": 2.2211397954213346, "step": 2280}, {"loss": 1.6657, "grad_norm": 0.4417429566383362, "learning_rate": 0.0002, "epoch": 2.2308816366293227, "step": 2290}, {"loss": 1.595, "grad_norm": 0.4757710099220276, "learning_rate": 0.0002, "epoch": 2.2406234778373113, "step": 2300}, {"loss": 1.591, "grad_norm": 0.44449448585510254, "learning_rate": 0.0002, "epoch": 2.2503653190452995, "step": 2310}, {"loss": 1.5742, "grad_norm": 0.5070863962173462, "learning_rate": 0.0002, "epoch": 2.260107160253288, "step": 2320}, {"loss": 1.5831, "grad_norm": 0.4967133700847626, "learning_rate": 0.0002, "epoch": 2.269849001461276, "step": 2330}, {"loss": 1.5857, "grad_norm": 0.5110220909118652, "learning_rate": 0.0002, "epoch": 2.2795908426692644, "step": 2340}, {"loss": 1.6266, "grad_norm": 0.47984135150909424, "learning_rate": 0.0002, "epoch": 2.289332683877253, "step": 2350}, {"loss": 1.5927, "grad_norm": 0.5005794763565063, "learning_rate": 0.0002, "epoch": 2.299074525085241, "step": 2360}, {"loss": 1.6131, "grad_norm": 0.4991425573825836, "learning_rate": 0.0002, "epoch": 2.3088163662932293, "step": 2370}, {"loss": 1.5386, "grad_norm": 0.4948616623878479, "learning_rate": 0.0002, "epoch": 2.318558207501218, "step": 2380}, {"loss": 1.5769, "grad_norm": 0.4533160328865051, "learning_rate": 0.0002, "epoch": 2.328300048709206, "step": 2390}, {"loss": 1.5856, "grad_norm": 0.5871071219444275, "learning_rate": 0.0002, "epoch": 2.338041889917194, "step": 2400}, {"loss": 1.591, "grad_norm": 0.5048075914382935, "learning_rate": 0.0002, "epoch": 2.347783731125183, "step": 2410}, {"loss": 1.6165, "grad_norm": 0.4973750412464142, "learning_rate": 0.0002, "epoch": 2.357525572333171, "step": 2420}, {"loss": 1.589, "grad_norm": 0.48294538259506226, "learning_rate": 0.0002, "epoch": 2.367267413541159, "step": 2430}, {"loss": 1.5782, "grad_norm": 0.7180454134941101, "learning_rate": 0.0002, "epoch": 2.3770092547491477, "step": 2440}, {"loss": 1.56, "grad_norm": 0.4627632796764374, "learning_rate": 0.0002, "epoch": 2.386751095957136, "step": 2450}, {"loss": 1.586, "grad_norm": 0.4834378957748413, "learning_rate": 0.0002, "epoch": 2.396492937165124, "step": 2460}, {"loss": 1.6145, "grad_norm": 0.5173670649528503, "learning_rate": 0.0002, "epoch": 2.4062347783731126, "step": 2470}, {"loss": 1.5464, "grad_norm": 0.49652737379074097, "learning_rate": 0.0002, "epoch": 2.4159766195811008, "step": 2480}, {"loss": 1.6977, "grad_norm": 0.47052669525146484, "learning_rate": 0.0002, "epoch": 2.4257184607890894, "step": 2490}, {"loss": 1.6215, "grad_norm": 0.5188006162643433, "learning_rate": 0.0002, "epoch": 2.4354603019970775, "step": 2500}, {"loss": 1.5512, "grad_norm": 0.5010119676589966, "learning_rate": 0.0002, "epoch": 2.4452021432050657, "step": 2510}, {"loss": 1.6403, "grad_norm": 0.4765235483646393, "learning_rate": 0.0002, "epoch": 2.4549439844130543, "step": 2520}, {"loss": 1.5907, "grad_norm": 0.5292699337005615, "learning_rate": 0.0002, "epoch": 2.4646858256210424, "step": 2530}, {"loss": 1.5866, "grad_norm": 0.48555099964141846, "learning_rate": 0.0002, "epoch": 2.4744276668290306, "step": 2540}, {"loss": 1.5361, "grad_norm": 0.4764043092727661, "learning_rate": 0.0002, "epoch": 2.484169508037019, "step": 2550}, {"loss": 1.6545, "grad_norm": 0.47839659452438354, "learning_rate": 0.0002, "epoch": 2.4939113492450073, "step": 2560}, {"loss": 1.6269, "grad_norm": 0.4514436721801758, "learning_rate": 0.0002, "epoch": 2.5036531904529955, "step": 2570}, {"loss": 1.6389, "grad_norm": 0.5681955218315125, "learning_rate": 0.0002, "epoch": 2.513395031660984, "step": 2580}, {"loss": 1.5976, "grad_norm": 0.49655985832214355, "learning_rate": 0.0002, "epoch": 2.5231368728689723, "step": 2590}, {"loss": 1.6052, "grad_norm": 0.5077657103538513, "learning_rate": 0.0002, "epoch": 2.5328787140769604, "step": 2600}, {"loss": 1.5658, "grad_norm": 0.5643279552459717, "learning_rate": 0.0002, "epoch": 2.542620555284949, "step": 2610}, {"loss": 1.5455, "grad_norm": 0.4715031087398529, "learning_rate": 0.0002, "epoch": 2.552362396492937, "step": 2620}, {"loss": 1.5907, "grad_norm": 0.528400719165802, "learning_rate": 0.0002, "epoch": 2.5621042377009253, "step": 2630}, {"loss": 1.5452, "grad_norm": 0.49469611048698425, "learning_rate": 0.0002, "epoch": 2.571846078908914, "step": 2640}, {"loss": 1.5904, "grad_norm": 0.4567806124687195, "learning_rate": 0.0002, "epoch": 2.581587920116902, "step": 2650}, {"loss": 1.6242, "grad_norm": 0.5357107520103455, "learning_rate": 0.0002, "epoch": 2.5913297613248902, "step": 2660}, {"loss": 1.6078, "grad_norm": 0.46977677941322327, "learning_rate": 0.0002, "epoch": 2.601071602532879, "step": 2670}, {"loss": 1.6994, "grad_norm": 0.6626771092414856, "learning_rate": 0.0002, "epoch": 2.610813443740867, "step": 2680}, {"loss": 1.5888, "grad_norm": 0.4587472081184387, "learning_rate": 0.0002, "epoch": 2.620555284948855, "step": 2690}, {"loss": 1.6002, "grad_norm": 0.4816797077655792, "learning_rate": 0.0002, "epoch": 2.6302971261568437, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.4856809675693512, "learning_rate": 0.0002, "epoch": 2.640038967364832, "step": 2710}, {"loss": 1.563, "grad_norm": 0.46010780334472656, "learning_rate": 0.0002, "epoch": 2.64978080857282, "step": 2720}, {"loss": 1.5374, "grad_norm": 0.4637954533100128, "learning_rate": 0.0002, "epoch": 2.6595226497808087, "step": 2730}, {"loss": 1.6493, "grad_norm": 0.5954997539520264, "learning_rate": 0.0002, "epoch": 2.669264490988797, "step": 2740}, {"loss": 1.5795, "grad_norm": 0.5071861743927002, "learning_rate": 0.0002, "epoch": 2.679006332196785, "step": 2750}, {"loss": 1.573, "grad_norm": 0.5415477156639099, "learning_rate": 0.0002, "epoch": 2.6887481734047736, "step": 2760}, {"loss": 1.5476, "grad_norm": 0.5618549585342407, "learning_rate": 0.0002, "epoch": 2.6984900146127617, "step": 2770}, {"loss": 1.608, "grad_norm": 0.49338817596435547, "learning_rate": 0.0002, "epoch": 2.70823185582075, "step": 2780}, {"loss": 1.6529, "grad_norm": 0.5149586796760559, "learning_rate": 0.0002, "epoch": 2.7179736970287385, "step": 2790}, {"loss": 1.6279, "grad_norm": 0.6247242093086243, "learning_rate": 0.0002, "epoch": 2.7277155382367266, "step": 2800}, {"loss": 1.4655, "grad_norm": 0.4749542474746704, "learning_rate": 0.0002, "epoch": 2.737457379444715, "step": 2810}, {"loss": 1.5984, "grad_norm": 0.4979191720485687, "learning_rate": 0.0002, "epoch": 2.7471992206527034, "step": 2820}, {"loss": 1.6377, "grad_norm": 0.4885074198246002, "learning_rate": 0.0002, "epoch": 2.7569410618606915, "step": 2830}, {"loss": 1.6529, "grad_norm": 0.5047747492790222, "learning_rate": 0.0002, "epoch": 2.76668290306868, "step": 2840}, {"loss": 1.6574, "grad_norm": 0.5280140042304993, "learning_rate": 0.0002, "epoch": 2.7764247442766683, "step": 2850}, {"loss": 1.5639, "grad_norm": 0.477668434381485, "learning_rate": 0.0002, "epoch": 2.7861665854846565, "step": 2860}, {"loss": 1.5923, "grad_norm": 0.4816327393054962, "learning_rate": 0.0002, "epoch": 2.795908426692645, "step": 2870}, {"loss": 1.6377, "grad_norm": 0.523259162902832, "learning_rate": 0.0002, "epoch": 2.805650267900633, "step": 2880}, {"loss": 1.5779, "grad_norm": 0.5045270919799805, "learning_rate": 0.0002, "epoch": 2.8153921091086214, "step": 2890}, {"loss": 1.5915, "grad_norm": 0.47986042499542236, "learning_rate": 0.0002, "epoch": 2.82513395031661, "step": 2900}, {"loss": 1.5997, "grad_norm": 0.4858797490596771, "learning_rate": 0.0002, "epoch": 2.834875791524598, "step": 2910}, {"loss": 1.5664, "grad_norm": 0.5261512398719788, "learning_rate": 0.0002, "epoch": 2.8446176327325867, "step": 2920}, {"loss": 1.5775, "grad_norm": 0.630550742149353, "learning_rate": 0.0002, "epoch": 2.854359473940575, "step": 2930}, {"loss": 1.5889, "grad_norm": 0.49119752645492554, "learning_rate": 0.0002, "epoch": 2.864101315148563, "step": 2940}, {"loss": 1.6033, "grad_norm": 0.4779070317745209, "learning_rate": 0.0002, "epoch": 2.8738431563565516, "step": 2950}, {"loss": 1.6353, "grad_norm": 0.5059782266616821, "learning_rate": 0.0002, "epoch": 2.88358499756454, "step": 2960}, {"loss": 1.6403, "grad_norm": 0.5466655492782593, "learning_rate": 0.0002, "epoch": 2.893326838772528, "step": 2970}, {"loss": 1.619, "grad_norm": 0.4865640103816986, "learning_rate": 0.0002, "epoch": 2.9030686799805165, "step": 2980}, {"loss": 1.5712, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.9128105211885047, "step": 2990}, {"loss": 1.6216, "grad_norm": 0.9112305641174316, "learning_rate": 0.0002, "epoch": 2.922552362396493, "step": 3000}, {"loss": 1.6493, "grad_norm": 0.3938814103603363, "learning_rate": 0.0002, "epoch": 2.9322942036044815, "step": 3010}, {"loss": 1.5446, "grad_norm": 0.5500800609588623, "learning_rate": 0.0002, "epoch": 2.9420360448124696, "step": 3020}, {"loss": 1.5626, "grad_norm": 0.5346390604972839, "learning_rate": 0.0002, "epoch": 2.9517778860204578, "step": 3030}, {"loss": 1.5783, "grad_norm": 0.5245014429092407, "learning_rate": 0.0002, "epoch": 2.9615197272284464, "step": 3040}, {"loss": 1.5549, "grad_norm": 0.4906884431838989, "learning_rate": 0.0002, "epoch": 2.9712615684364345, "step": 3050}, {"loss": 1.5796, "grad_norm": 0.47086769342422485, "learning_rate": 0.0002, "epoch": 2.9810034096444227, "step": 3060}, {"loss": 1.5917, "grad_norm": 0.5290229320526123, "learning_rate": 0.0002, "epoch": 2.9907452508524113, "step": 3070}, {"eval_loss": 1.8463934659957886, "eval_runtime": 56.2401, "eval_samples_per_second": 9.015, "eval_steps_per_second": 1.138, "epoch": 2.9995129079396006, "step": 3079}, {"loss": 1.5744, "grad_norm": 0.49992576241493225, "learning_rate": 0.0002, "epoch": 3.0004870920603994, "step": 3080}, {"loss": 1.4125, "grad_norm": 0.8242783546447754, "learning_rate": 0.0002, "epoch": 3.0102289332683876, "step": 3090}, {"loss": 1.394, "grad_norm": 0.6330569386482239, "learning_rate": 0.0002, "epoch": 3.019970774476376, "step": 3100}, {"loss": 1.4942, "grad_norm": 0.566097617149353, "learning_rate": 0.0002, "epoch": 3.0297126156843643, "step": 3110}, {"loss": 1.4365, "grad_norm": 0.6337586045265198, "learning_rate": 0.0002, "epoch": 3.0394544568923525, "step": 3120}, {"loss": 1.3916, "grad_norm": 0.7339403033256531, "learning_rate": 0.0002, "epoch": 3.049196298100341, "step": 3130}, {"loss": 1.4617, "grad_norm": 0.7187346816062927, "learning_rate": 0.0002, "epoch": 3.0589381393083293, "step": 3140}, {"loss": 1.3453, "grad_norm": 0.7116255760192871, "learning_rate": 0.0002, "epoch": 3.0686799805163174, "step": 3150}, {"loss": 1.4452, "grad_norm": 0.6493807435035706, "learning_rate": 0.0002, "epoch": 3.078421821724306, "step": 3160}, {"loss": 1.351, "grad_norm": 0.6777266263961792, "learning_rate": 0.0002, "epoch": 3.088163662932294, "step": 3170}, {"loss": 1.4362, "grad_norm": 0.6342006325721741, "learning_rate": 0.0002, "epoch": 3.0979055041402823, "step": 3180}, {"loss": 1.4748, "grad_norm": 0.6608964204788208, "learning_rate": 0.0002, "epoch": 3.107647345348271, "step": 3190}, {"loss": 1.375, "grad_norm": 0.7230247259140015, "learning_rate": 0.0002, "epoch": 3.117389186556259, "step": 3200}, {"loss": 1.4049, "grad_norm": 0.650368332862854, "learning_rate": 0.0002, "epoch": 3.1271310277642472, "step": 3210}, {"loss": 1.409, "grad_norm": 0.7319342494010925, "learning_rate": 0.0002, "epoch": 3.136872868972236, "step": 3220}, {"loss": 1.3872, "grad_norm": 0.7159963846206665, "learning_rate": 0.0002, "epoch": 3.146614710180224, "step": 3230}, {"loss": 1.5076, "grad_norm": 0.8905230164527893, "learning_rate": 0.0002, "epoch": 3.156356551388212, "step": 3240}, {"loss": 1.3161, "grad_norm": 0.6920804381370544, "learning_rate": 0.0002, "epoch": 3.1660983925962007, "step": 3250}, {"loss": 1.3786, "grad_norm": 0.6782063841819763, "learning_rate": 0.0002, "epoch": 3.175840233804189, "step": 3260}, {"loss": 1.5153, "grad_norm": 0.735325276851654, "learning_rate": 0.0002, "epoch": 3.1855820750121775, "step": 3270}, {"loss": 1.4027, "grad_norm": 0.6657978296279907, "learning_rate": 0.0002, "epoch": 3.1953239162201656, "step": 3280}, {"loss": 1.3456, "grad_norm": 0.771315336227417, "learning_rate": 0.0002, "epoch": 3.205065757428154, "step": 3290}, {"loss": 1.3236, "grad_norm": 0.6492983102798462, "learning_rate": 0.0002, "epoch": 3.2148075986361424, "step": 3300}, {"loss": 1.4125, "grad_norm": 0.7513770461082458, "learning_rate": 0.0002, "epoch": 3.2245494398441306, "step": 3310}, {"loss": 1.4032, "grad_norm": 0.7091423869132996, "learning_rate": 0.0002, "epoch": 3.2342912810521187, "step": 3320}, {"loss": 1.4585, "grad_norm": 0.6663975119590759, "learning_rate": 0.0002, "epoch": 3.2440331222601073, "step": 3330}, {"loss": 1.3968, "grad_norm": 0.6813122034072876, "learning_rate": 0.0002, "epoch": 3.2537749634680955, "step": 3340}, {"loss": 1.3681, "grad_norm": 0.6602569818496704, "learning_rate": 0.0002, "epoch": 3.2635168046760836, "step": 3350}, {"loss": 1.4533, "grad_norm": 0.718270480632782, "learning_rate": 0.0002, "epoch": 3.2732586458840722, "step": 3360}, {"loss": 1.4076, "grad_norm": 0.6884173154830933, "learning_rate": 0.0002, "epoch": 3.2830004870920604, "step": 3370}, {"loss": 1.4144, "grad_norm": 0.7039775848388672, "learning_rate": 0.0002, "epoch": 3.2927423283000485, "step": 3380}, {"loss": 1.5077, "grad_norm": 0.7444299459457397, "learning_rate": 0.0002, "epoch": 3.302484169508037, "step": 3390}, {"loss": 1.4255, "grad_norm": 0.7187064290046692, "learning_rate": 0.0002, "epoch": 3.3122260107160253, "step": 3400}, {"loss": 1.3684, "grad_norm": 0.599396288394928, "learning_rate": 0.0002, "epoch": 3.3219678519240134, "step": 3410}, {"loss": 1.4819, "grad_norm": 0.7670390009880066, "learning_rate": 0.0002, "epoch": 3.331709693132002, "step": 3420}, {"loss": 1.4411, "grad_norm": 0.6654478311538696, "learning_rate": 0.0002, "epoch": 3.34145153433999, "step": 3430}, {"loss": 1.4257, "grad_norm": 0.6644385457038879, "learning_rate": 0.0002, "epoch": 3.351193375547979, "step": 3440}, {"loss": 1.4508, "grad_norm": 0.6974098086357117, "learning_rate": 0.0002, "epoch": 3.360935216755967, "step": 3450}, {"loss": 1.3807, "grad_norm": 0.7350399494171143, "learning_rate": 0.0002, "epoch": 3.370677057963955, "step": 3460}, {"loss": 1.4176, "grad_norm": 0.714721143245697, "learning_rate": 0.0002, "epoch": 3.3804188991719437, "step": 3470}, {"loss": 1.4325, "grad_norm": 0.7006027698516846, "learning_rate": 0.0002, "epoch": 3.390160740379932, "step": 3480}, {"loss": 1.4888, "grad_norm": 0.6767925024032593, "learning_rate": 0.0002, "epoch": 3.39990258158792, "step": 3490}, {"loss": 1.4116, "grad_norm": 0.6721355319023132, "learning_rate": 0.0002, "epoch": 3.4096444227959086, "step": 3500}, {"loss": 1.443, "grad_norm": 0.6845725178718567, "learning_rate": 0.0002, "epoch": 3.419386264003897, "step": 3510}, {"loss": 1.4832, "grad_norm": 0.6882196664810181, "learning_rate": 0.0002, "epoch": 3.429128105211885, "step": 3520}, {"loss": 1.4962, "grad_norm": 0.7663240432739258, "learning_rate": 0.0002, "epoch": 3.4388699464198735, "step": 3530}, {"loss": 1.4644, "grad_norm": 0.6304219365119934, "learning_rate": 0.0002, "epoch": 3.4486117876278617, "step": 3540}, {"loss": 1.4918, "grad_norm": 0.668678879737854, "learning_rate": 0.0002, "epoch": 3.45835362883585, "step": 3550}, {"loss": 1.4874, "grad_norm": 0.7526912093162537, "learning_rate": 0.0002, "epoch": 3.4680954700438384, "step": 3560}, {"loss": 1.4249, "grad_norm": 1.089495301246643, "learning_rate": 0.0002, "epoch": 3.4778373112518266, "step": 3570}, {"loss": 1.3871, "grad_norm": 0.7282902002334595, "learning_rate": 0.0002, "epoch": 3.4875791524598148, "step": 3580}, {"loss": 1.5077, "grad_norm": 0.6540156602859497, "learning_rate": 0.0002, "epoch": 3.4973209936678034, "step": 3590}, {"loss": 1.4367, "grad_norm": 0.6449568867683411, "learning_rate": 0.0002, "epoch": 3.5070628348757915, "step": 3600}, {"loss": 1.4532, "grad_norm": 0.7262216210365295, "learning_rate": 0.0002, "epoch": 3.5168046760837797, "step": 3610}, {"loss": 1.4374, "grad_norm": 0.6048615574836731, "learning_rate": 0.0002, "epoch": 3.5265465172917683, "step": 3620}, {"loss": 1.3877, "grad_norm": 0.6780537366867065, "learning_rate": 0.0002, "epoch": 3.5362883584997564, "step": 3630}, {"loss": 1.422, "grad_norm": 0.6851925253868103, "learning_rate": 0.0002, "epoch": 3.5460301997077446, "step": 3640}, {"loss": 1.3425, "grad_norm": 0.6530634164810181, "learning_rate": 0.0002, "epoch": 3.555772040915733, "step": 3650}, {"loss": 1.4879, "grad_norm": 0.7193992733955383, "learning_rate": 0.0002, "epoch": 3.5655138821237213, "step": 3660}, {"loss": 1.4555, "grad_norm": 0.767496645450592, "learning_rate": 0.0002, "epoch": 3.5752557233317095, "step": 3670}, {"loss": 1.4824, "grad_norm": 0.6912919282913208, "learning_rate": 0.0002, "epoch": 3.584997564539698, "step": 3680}, {"loss": 1.4497, "grad_norm": 0.7383436560630798, "learning_rate": 0.0002, "epoch": 3.5947394057476862, "step": 3690}, {"loss": 1.4822, "grad_norm": 0.6746662855148315, "learning_rate": 0.0002, "epoch": 3.6044812469556744, "step": 3700}, {"loss": 1.4904, "grad_norm": 0.6885138750076294, "learning_rate": 0.0002, "epoch": 3.614223088163663, "step": 3710}, {"loss": 1.4044, "grad_norm": 0.6694392561912537, "learning_rate": 0.0002, "epoch": 3.623964929371651, "step": 3720}, {"loss": 1.3719, "grad_norm": 0.812358021736145, "learning_rate": 0.0002, "epoch": 3.6337067705796393, "step": 3730}, {"loss": 1.4603, "grad_norm": 0.7267130017280579, "learning_rate": 0.0002, "epoch": 3.643448611787628, "step": 3740}, {"loss": 1.4574, "grad_norm": 0.6958749294281006, "learning_rate": 0.0002, "epoch": 3.653190452995616, "step": 3750}, {"loss": 1.4346, "grad_norm": 0.6805673241615295, "learning_rate": 0.0002, "epoch": 3.6629322942036042, "step": 3760}, {"loss": 1.4338, "grad_norm": 0.7184410095214844, "learning_rate": 0.0002, "epoch": 3.672674135411593, "step": 3770}, {"loss": 1.3935, "grad_norm": 0.7716330289840698, "learning_rate": 0.0002, "epoch": 3.682415976619581, "step": 3780}, {"loss": 1.384, "grad_norm": 0.6675831079483032, "learning_rate": 0.0002, "epoch": 3.6921578178275696, "step": 3790}, {"loss": 1.401, "grad_norm": 0.6480095386505127, "learning_rate": 0.0002, "epoch": 3.7018996590355577, "step": 3800}, {"loss": 1.5303, "grad_norm": 0.6559418439865112, "learning_rate": 0.0002, "epoch": 3.711641500243546, "step": 3810}, {"loss": 1.4341, "grad_norm": 0.6596545577049255, "learning_rate": 0.0002, "epoch": 3.7213833414515345, "step": 3820}, {"loss": 1.4508, "grad_norm": 0.7172950506210327, "learning_rate": 0.0002, "epoch": 3.7311251826595226, "step": 3830}, {"loss": 1.446, "grad_norm": 0.796148419380188, "learning_rate": 0.0002, "epoch": 3.740867023867511, "step": 3840}, {"loss": 1.4992, "grad_norm": 0.6600322723388672, "learning_rate": 0.0002, "epoch": 3.7506088650754994, "step": 3850}, {"loss": 1.4201, "grad_norm": 0.6776387691497803, "learning_rate": 0.0002, "epoch": 3.7603507062834876, "step": 3860}, {"loss": 1.3893, "grad_norm": 0.7768304347991943, "learning_rate": 0.0002, "epoch": 3.770092547491476, "step": 3870}, {"loss": 1.4886, "grad_norm": 1.0579794645309448, "learning_rate": 0.0002, "epoch": 3.7798343886994643, "step": 3880}, {"loss": 1.4556, "grad_norm": 0.6757252812385559, "learning_rate": 0.0002, "epoch": 3.7895762299074525, "step": 3890}, {"loss": 1.4647, "grad_norm": 0.6706996560096741, "learning_rate": 0.0002, "epoch": 3.799318071115441, "step": 3900}, {"loss": 1.4104, "grad_norm": 0.7026948928833008, "learning_rate": 0.0002, "epoch": 3.809059912323429, "step": 3910}, {"loss": 1.5487, "grad_norm": 0.6437768340110779, "learning_rate": 0.0002, "epoch": 3.8188017535314174, "step": 3920}, {"loss": 1.4678, "grad_norm": 0.7015706300735474, "learning_rate": 0.0002, "epoch": 3.828543594739406, "step": 3930}, {"loss": 1.4891, "grad_norm": 0.7049482464790344, "learning_rate": 0.0002, "epoch": 3.838285435947394, "step": 3940}, {"loss": 1.4208, "grad_norm": 0.6533724665641785, "learning_rate": 0.0002, "epoch": 3.8480272771553823, "step": 3950}, {"loss": 1.4435, "grad_norm": 0.7312499284744263, "learning_rate": 0.0002, "epoch": 3.857769118363371, "step": 3960}, {"loss": 1.3886, "grad_norm": 0.6858801245689392, "learning_rate": 0.0002, "epoch": 3.867510959571359, "step": 3970}, {"loss": 1.4423, "grad_norm": 0.770423173904419, "learning_rate": 0.0002, "epoch": 3.877252800779347, "step": 3980}, {"loss": 1.5029, "grad_norm": 0.6987539529800415, "learning_rate": 0.0002, "epoch": 3.886994641987336, "step": 3990}, {"loss": 1.4791, "grad_norm": 0.7072722315788269, "learning_rate": 0.0002, "epoch": 3.896736483195324, "step": 4000}, {"loss": 1.528, "grad_norm": 0.6492931842803955, "learning_rate": 0.0002, "epoch": 3.906478324403312, "step": 4010}, {"loss": 1.3824, "grad_norm": 0.7716232538223267, "learning_rate": 0.0002, "epoch": 3.9162201656113007, "step": 4020}, {"loss": 1.4758, "grad_norm": 0.722949743270874, "learning_rate": 0.0002, "epoch": 3.925962006819289, "step": 4030}, {"loss": 1.3914, "grad_norm": 0.7434365749359131, "learning_rate": 0.0002, "epoch": 3.935703848027277, "step": 4040}, {"loss": 1.4763, "grad_norm": 0.6691509485244751, "learning_rate": 0.0002, "epoch": 3.9454456892352656, "step": 4050}, {"loss": 1.4555, "grad_norm": 0.6850284337997437, "learning_rate": 0.0002, "epoch": 3.9551875304432538, "step": 4060}, {"loss": 1.5275, "grad_norm": 0.6954452991485596, "learning_rate": 0.0002, "epoch": 3.964929371651242, "step": 4070}, {"loss": 1.417, "grad_norm": 0.9316364526748657, "learning_rate": 0.0002, "epoch": 3.9746712128592305, "step": 4080}, {"loss": 1.4532, "grad_norm": 0.6908289194107056, "learning_rate": 0.0002, "epoch": 3.9844130540672187, "step": 4090}, {"loss": 1.4404, "grad_norm": 0.666782021522522, "learning_rate": 0.0002, "epoch": 3.994154895275207, "step": 4100}, {"eval_loss": 1.9233275651931763, "eval_runtime": 55.9536, "eval_samples_per_second": 9.061, "eval_steps_per_second": 1.144, "epoch": 4.0, "step": 4106}, {"loss": 1.3489, "grad_norm": 0.7726166248321533, "learning_rate": 0.0002, "epoch": 4.003896736483195, "step": 4110}, {"loss": 1.1415, "grad_norm": 1.1338967084884644, "learning_rate": 0.0002, "epoch": 4.013638577691184, "step": 4120}, {"loss": 1.2212, "grad_norm": 0.9530029296875, "learning_rate": 0.0002, "epoch": 4.023380418899172, "step": 4130}, {"loss": 1.2002, "grad_norm": 1.1058554649353027, "learning_rate": 0.0002, "epoch": 4.03312226010716, "step": 4140}, {"loss": 1.2381, "grad_norm": 0.8765049576759338, "learning_rate": 0.0002, "epoch": 4.042864101315149, "step": 4150}, {"loss": 1.2708, "grad_norm": 1.1774667501449585, "learning_rate": 0.0002, "epoch": 4.052605942523137, "step": 4160}, {"loss": 1.2116, "grad_norm": 0.9301433563232422, "learning_rate": 0.0002, "epoch": 4.062347783731125, "step": 4170}, {"loss": 1.1807, "grad_norm": 1.0196778774261475, "learning_rate": 0.0002, "epoch": 4.072089624939114, "step": 4180}, {"loss": 1.2602, "grad_norm": 1.1380577087402344, "learning_rate": 0.0002, "epoch": 4.081831466147102, "step": 4190}, {"loss": 1.2521, "grad_norm": 0.9121319651603699, "learning_rate": 0.0002, "epoch": 4.09157330735509, "step": 4200}, {"loss": 1.1747, "grad_norm": 0.9495378732681274, "learning_rate": 0.0002, "epoch": 4.101315148563079, "step": 4210}, {"loss": 1.1829, "grad_norm": 0.8058680295944214, "learning_rate": 0.0002, "epoch": 4.1110569897710665, "step": 4220}, {"loss": 1.1732, "grad_norm": 1.000887393951416, "learning_rate": 0.0002, "epoch": 4.120798830979055, "step": 4230}, {"loss": 1.1947, "grad_norm": 0.9529102444648743, "learning_rate": 0.0002, "epoch": 4.130540672187044, "step": 4240}, {"loss": 1.2104, "grad_norm": 1.0257115364074707, "learning_rate": 0.0002, "epoch": 4.140282513395031, "step": 4250}, {"loss": 1.2293, "grad_norm": 0.9590303897857666, "learning_rate": 0.0002, "epoch": 4.15002435460302, "step": 4260}, {"loss": 1.1918, "grad_norm": 1.065291166305542, "learning_rate": 0.0002, "epoch": 4.159766195811009, "step": 4270}, {"loss": 1.2323, "grad_norm": 0.8819697499275208, "learning_rate": 0.0002, "epoch": 4.169508037018996, "step": 4280}, {"loss": 1.2167, "grad_norm": 1.0335261821746826, "learning_rate": 0.0002, "epoch": 4.179249878226985, "step": 4290}, {"loss": 1.2131, "grad_norm": 0.8872809410095215, "learning_rate": 0.0002, "epoch": 4.1889917194349735, "step": 4300}, {"loss": 1.2794, "grad_norm": 0.9883159399032593, "learning_rate": 0.0002, "epoch": 4.198733560642961, "step": 4310}, {"loss": 1.2544, "grad_norm": 1.0254192352294922, "learning_rate": 0.0002, "epoch": 4.20847540185095, "step": 4320}, {"loss": 1.2595, "grad_norm": 0.9432600736618042, "learning_rate": 0.0002, "epoch": 4.218217243058938, "step": 4330}, {"loss": 1.2684, "grad_norm": 1.1008676290512085, "learning_rate": 0.0002, "epoch": 4.227959084266926, "step": 4340}, {"loss": 1.2149, "grad_norm": 1.0829699039459229, "learning_rate": 0.0002, "epoch": 4.237700925474915, "step": 4350}, {"loss": 1.2621, "grad_norm": 1.016847848892212, "learning_rate": 0.0002, "epoch": 4.247442766682903, "step": 4360}, {"loss": 1.2375, "grad_norm": 0.8924864530563354, "learning_rate": 0.0002, "epoch": 4.257184607890891, "step": 4370}, {"loss": 1.1987, "grad_norm": 0.9300530552864075, "learning_rate": 0.0002, "epoch": 4.26692644909888, "step": 4380}, {"loss": 1.1696, "grad_norm": 0.9684814810752869, "learning_rate": 0.0002, "epoch": 4.276668290306868, "step": 4390}, {"loss": 1.2006, "grad_norm": 0.9916250705718994, "learning_rate": 0.0002, "epoch": 4.286410131514856, "step": 4400}, {"loss": 1.2402, "grad_norm": 0.903680145740509, "learning_rate": 0.0002, "epoch": 4.2961519727228445, "step": 4410}, {"loss": 1.2022, "grad_norm": 0.8713505268096924, "learning_rate": 0.0002, "epoch": 4.305893813930833, "step": 4420}, {"loss": 1.1957, "grad_norm": 0.9983905553817749, "learning_rate": 0.0002, "epoch": 4.315635655138821, "step": 4430}, {"loss": 1.2676, "grad_norm": 1.1689040660858154, "learning_rate": 0.0002, "epoch": 4.3253774963468095, "step": 4440}, {"loss": 1.2166, "grad_norm": 0.9316853880882263, "learning_rate": 0.0002, "epoch": 4.335119337554798, "step": 4450}, {"loss": 1.222, "grad_norm": 0.9175887107849121, "learning_rate": 0.0002, "epoch": 4.344861178762786, "step": 4460}, {"loss": 1.2571, "grad_norm": 0.9348906874656677, "learning_rate": 0.0002, "epoch": 4.354603019970774, "step": 4470}, {"loss": 1.2764, "grad_norm": 0.9727016687393188, "learning_rate": 0.0002, "epoch": 4.364344861178763, "step": 4480}, {"loss": 1.2616, "grad_norm": 0.9843429923057556, "learning_rate": 0.0002, "epoch": 4.374086702386751, "step": 4490}, {"loss": 1.2488, "grad_norm": 0.9615852236747742, "learning_rate": 0.0002, "epoch": 4.383828543594739, "step": 4500}, {"loss": 1.1718, "grad_norm": 0.9688583612442017, "learning_rate": 0.0002, "epoch": 4.393570384802728, "step": 4510}, {"loss": 1.2546, "grad_norm": 0.9933668375015259, "learning_rate": 0.0002, "epoch": 4.403312226010716, "step": 4520}, {"loss": 1.2355, "grad_norm": 1.0626686811447144, "learning_rate": 0.0002, "epoch": 4.413054067218704, "step": 4530}, {"loss": 1.2425, "grad_norm": 0.9536267518997192, "learning_rate": 0.0002, "epoch": 4.422795908426693, "step": 4540}, {"loss": 1.2562, "grad_norm": 0.9777140021324158, "learning_rate": 0.0002, "epoch": 4.432537749634681, "step": 4550}, {"loss": 1.2878, "grad_norm": 0.980780839920044, "learning_rate": 0.0002, "epoch": 4.442279590842669, "step": 4560}, {"loss": 1.2597, "grad_norm": 1.0147196054458618, "learning_rate": 0.0002, "epoch": 4.452021432050658, "step": 4570}, {"loss": 1.2148, "grad_norm": 0.9763361811637878, "learning_rate": 0.0002, "epoch": 4.461763273258645, "step": 4580}, {"loss": 1.3076, "grad_norm": 1.0300798416137695, "learning_rate": 0.0002, "epoch": 4.471505114466634, "step": 4590}, {"loss": 1.2665, "grad_norm": 0.8833121657371521, "learning_rate": 0.0002, "epoch": 4.481246955674623, "step": 4600}, {"loss": 1.1899, "grad_norm": 1.1214020252227783, "learning_rate": 0.0002, "epoch": 4.490988796882611, "step": 4610}, {"loss": 1.2579, "grad_norm": 0.8843787908554077, "learning_rate": 0.0002, "epoch": 4.500730638090599, "step": 4620}, {"loss": 1.2633, "grad_norm": 0.9942020773887634, "learning_rate": 0.0002, "epoch": 4.5104724792985875, "step": 4630}, {"loss": 1.3172, "grad_norm": 1.0033202171325684, "learning_rate": 0.0002, "epoch": 4.520214320506576, "step": 4640}, {"loss": 1.2024, "grad_norm": 0.8767235279083252, "learning_rate": 0.0002, "epoch": 4.529956161714564, "step": 4650}, {"loss": 1.2714, "grad_norm": 1.0117276906967163, "learning_rate": 0.0002, "epoch": 4.539698002922552, "step": 4660}, {"loss": 1.2911, "grad_norm": 1.2787362337112427, "learning_rate": 0.0002, "epoch": 4.549439844130541, "step": 4670}, {"loss": 1.2603, "grad_norm": 0.8824878931045532, "learning_rate": 0.0002, "epoch": 4.559181685338529, "step": 4680}, {"loss": 1.2905, "grad_norm": 0.9209560751914978, "learning_rate": 0.0002, "epoch": 4.568923526546517, "step": 4690}, {"loss": 1.1916, "grad_norm": 1.1064010858535767, "learning_rate": 0.0002, "epoch": 4.578665367754506, "step": 4700}, {"loss": 1.2217, "grad_norm": 0.8914572596549988, "learning_rate": 0.0002, "epoch": 4.588407208962494, "step": 4710}, {"loss": 1.2861, "grad_norm": 1.0412265062332153, "learning_rate": 0.0002, "epoch": 4.598149050170482, "step": 4720}, {"loss": 1.262, "grad_norm": 1.1950221061706543, "learning_rate": 0.0002, "epoch": 4.607890891378471, "step": 4730}, {"loss": 1.2659, "grad_norm": 0.8938062787055969, "learning_rate": 0.0002, "epoch": 4.617632732586459, "step": 4740}, {"loss": 1.2621, "grad_norm": 0.9849569201469421, "learning_rate": 0.0002, "epoch": 4.627374573794447, "step": 4750}, {"loss": 1.2341, "grad_norm": 1.0081515312194824, "learning_rate": 0.0002, "epoch": 4.637116415002436, "step": 4760}, {"loss": 1.2023, "grad_norm": 0.8566309213638306, "learning_rate": 0.0002, "epoch": 4.6468582562104235, "step": 4770}, {"loss": 1.2723, "grad_norm": 1.1750118732452393, "learning_rate": 0.0002, "epoch": 4.656600097418412, "step": 4780}, {"loss": 1.2537, "grad_norm": 0.925502598285675, "learning_rate": 0.0002, "epoch": 4.666341938626401, "step": 4790}, {"loss": 1.2146, "grad_norm": 1.0402472019195557, "learning_rate": 0.0002, "epoch": 4.676083779834388, "step": 4800}, {"loss": 1.2555, "grad_norm": 0.9772472977638245, "learning_rate": 0.0002, "epoch": 4.685825621042377, "step": 4810}, {"loss": 1.2667, "grad_norm": 0.9082779288291931, "learning_rate": 0.0002, "epoch": 4.695567462250366, "step": 4820}, {"loss": 1.2465, "grad_norm": 0.8026862740516663, "learning_rate": 0.0002, "epoch": 4.705309303458353, "step": 4830}, {"loss": 1.3369, "grad_norm": 1.1631089448928833, "learning_rate": 0.0002, "epoch": 4.715051144666342, "step": 4840}, {"loss": 1.261, "grad_norm": 0.9384787678718567, "learning_rate": 0.0002, "epoch": 4.7247929858743305, "step": 4850}, {"loss": 1.2588, "grad_norm": 1.2151581048965454, "learning_rate": 0.0002, "epoch": 4.734534827082318, "step": 4860}, {"loss": 1.363, "grad_norm": 0.9679436087608337, "learning_rate": 0.0002, "epoch": 4.744276668290307, "step": 4870}, {"loss": 1.3292, "grad_norm": 0.8352158069610596, "learning_rate": 0.0002, "epoch": 4.754018509498295, "step": 4880}, {"loss": 1.3056, "grad_norm": 1.0205804109573364, "learning_rate": 0.0002, "epoch": 4.763760350706283, "step": 4890}, {"loss": 1.223, "grad_norm": 0.9814772605895996, "learning_rate": 0.0002, "epoch": 4.773502191914272, "step": 4900}, {"loss": 1.3114, "grad_norm": 1.002854347229004, "learning_rate": 0.0002, "epoch": 4.78324403312226, "step": 4910}, {"loss": 1.3143, "grad_norm": 1.1609505414962769, "learning_rate": 0.0002, "epoch": 4.792985874330248, "step": 4920}, {"loss": 1.3166, "grad_norm": 0.9354982376098633, "learning_rate": 0.0002, "epoch": 4.802727715538237, "step": 4930}, {"loss": 1.2978, "grad_norm": 0.9761685729026794, "learning_rate": 0.0002, "epoch": 4.812469556746225, "step": 4940}, {"loss": 1.2709, "grad_norm": 1.0604596138000488, "learning_rate": 0.0002, "epoch": 4.822211397954213, "step": 4950}, {"loss": 1.2765, "grad_norm": 1.0902808904647827, "learning_rate": 0.0002, "epoch": 4.8319532391622015, "step": 4960}, {"loss": 1.3073, "grad_norm": 1.0174955129623413, "learning_rate": 0.0002, "epoch": 4.84169508037019, "step": 4970}, {"loss": 1.3141, "grad_norm": 1.0995253324508667, "learning_rate": 0.0002, "epoch": 4.851436921578179, "step": 4980}, {"loss": 1.3006, "grad_norm": 0.880993127822876, "learning_rate": 0.0002, "epoch": 4.8611787627861665, "step": 4990}, {"loss": 1.2547, "grad_norm": 0.9472237825393677, "learning_rate": 0.0002, "epoch": 4.870920603994155, "step": 5000}, {"loss": 1.4078, "grad_norm": 0.9504236578941345, "learning_rate": 0.0002, "epoch": 4.880662445202143, "step": 5010}, {"loss": 1.2791, "grad_norm": 1.1261742115020752, "learning_rate": 0.0002, "epoch": 4.890404286410131, "step": 5020}, {"loss": 1.3707, "grad_norm": 0.904674768447876, "learning_rate": 0.0002, "epoch": 4.90014612761812, "step": 5030}, {"loss": 1.2762, "grad_norm": 0.8828991055488586, "learning_rate": 0.0002, "epoch": 4.909887968826109, "step": 5040}, {"loss": 1.2905, "grad_norm": 1.0156532526016235, "learning_rate": 0.0002, "epoch": 4.919629810034096, "step": 5050}, {"loss": 1.3079, "grad_norm": 0.8975168466567993, "learning_rate": 0.0002, "epoch": 4.929371651242085, "step": 5060}, {"loss": 1.3322, "grad_norm": 0.9787213802337646, "learning_rate": 0.0002, "epoch": 4.939113492450073, "step": 5070}, {"loss": 1.2533, "grad_norm": 1.0801568031311035, "learning_rate": 0.0002, "epoch": 4.948855333658061, "step": 5080}, {"loss": 1.238, "grad_norm": 1.0655089616775513, "learning_rate": 0.0002, "epoch": 4.95859717486605, "step": 5090}, {"loss": 1.2449, "grad_norm": 0.8941320180892944, "learning_rate": 0.0002, "epoch": 4.968339016074038, "step": 5100}, {"loss": 1.2846, "grad_norm": 1.050621747970581, "learning_rate": 0.0002, "epoch": 4.978080857282026, "step": 5110}, {"loss": 1.3791, "grad_norm": 0.9724781513214111, "learning_rate": 0.0002, "epoch": 4.987822698490015, "step": 5120}, {"loss": 1.292, "grad_norm": 0.9850538969039917, "learning_rate": 0.0002, "epoch": 4.997564539698003, "step": 5130}]} +{"epoch": 6.0, "step": 6159, "epoch_duration": 1656.5674257278442, "total_accumulated_duration": 10881.490772008896, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0782, "grad_norm": 0.6537588834762573, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4644, "grad_norm": 0.5270306468009949, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2589, "grad_norm": 0.6826501488685608, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0141, "grad_norm": 0.5061377286911011, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9458, "grad_norm": 0.4300410747528076, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.983, "grad_norm": 0.5063319802284241, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9799, "grad_norm": 0.49310117959976196, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9277, "grad_norm": 0.4676004648208618, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9147, "grad_norm": 0.41647228598594666, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8894, "grad_norm": 0.40217313170433044, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9099, "grad_norm": 0.4123637080192566, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8471, "grad_norm": 0.37414297461509705, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3179326355457306, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.847, "grad_norm": 0.3548192083835602, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7919, "grad_norm": 0.3273540139198303, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8496, "grad_norm": 0.36500975489616394, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8473, "grad_norm": 0.4106619656085968, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.841, "grad_norm": 0.41361644864082336, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8879, "grad_norm": 0.3608580231666565, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7717, "grad_norm": 0.4291760325431824, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8437, "grad_norm": 0.344184011220932, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8779, "grad_norm": 0.3834705650806427, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7533, "grad_norm": 0.3738210201263428, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.824, "grad_norm": 0.4306780695915222, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8519, "grad_norm": 0.5066465139389038, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7402, "grad_norm": 0.34227681159973145, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8614, "grad_norm": 0.3346865475177765, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8502, "grad_norm": 0.3639362156391144, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.33223700523376465, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.35176315903663635, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7635, "grad_norm": 0.3581472635269165, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8262, "grad_norm": 0.35943421721458435, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8167, "grad_norm": 0.322051078081131, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8221, "grad_norm": 0.33904823660850525, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9159, "grad_norm": 0.39162731170654297, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7966, "grad_norm": 0.330624520778656, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8646, "grad_norm": 0.3793248236179352, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8038, "grad_norm": 0.3347395658493042, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9244, "grad_norm": 0.30527254939079285, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7982, "grad_norm": 0.3081390857696533, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8968, "grad_norm": 0.3742620050907135, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8095, "grad_norm": 0.4080568253993988, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8555, "grad_norm": 0.38034746050834656, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8494, "grad_norm": 0.34893402457237244, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8481, "grad_norm": 0.33285608887672424, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8466, "grad_norm": 0.4110095798969269, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7906, "grad_norm": 0.3658817410469055, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7589, "grad_norm": 0.31350770592689514, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7839, "grad_norm": 0.38827991485595703, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8224, "grad_norm": 0.3792393207550049, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8028, "grad_norm": 0.3004095256328583, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6899, "grad_norm": 0.3200063407421112, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8205, "grad_norm": 0.3206128478050232, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7725, "grad_norm": 0.30258631706237793, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7791, "grad_norm": 0.28210392594337463, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.34854066371917725, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8331, "grad_norm": 0.31689873337745667, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8311, "grad_norm": 0.31253790855407715, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7035, "grad_norm": 0.3229721188545227, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.766, "grad_norm": 0.3723772466182709, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8357, "grad_norm": 0.345798522233963, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7266, "grad_norm": 0.3440598249435425, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8307, "grad_norm": 0.3406416177749634, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8103, "grad_norm": 0.3218357264995575, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7457, "grad_norm": 0.45319172739982605, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8674, "grad_norm": 0.2787110507488251, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8426, "grad_norm": 0.3064707815647125, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.846, "grad_norm": 0.2940629720687866, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7865, "grad_norm": 0.31695225834846497, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.29589611291885376, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7828, "grad_norm": 0.3062121570110321, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8108, "grad_norm": 0.3315656781196594, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8056, "grad_norm": 0.30353930592536926, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7682, "grad_norm": 0.28360483050346375, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3362562656402588, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7607, "grad_norm": 0.40434667468070984, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8374, "grad_norm": 0.2930425703525543, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8216, "grad_norm": 0.30177003145217896, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8081, "grad_norm": 0.2784474790096283, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.35849854350090027, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7896, "grad_norm": 0.27329114079475403, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7633, "grad_norm": 0.33331671357154846, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7508, "grad_norm": 0.28727295994758606, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31391268968582153, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.793, "grad_norm": 0.3303709030151367, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7479, "grad_norm": 0.33772537112236023, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.32876333594322205, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7832, "grad_norm": 0.28444716334342957, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7077, "grad_norm": 0.3070019483566284, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7699, "grad_norm": 0.29484760761260986, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7211, "grad_norm": 0.32373034954071045, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7799, "grad_norm": 0.3229396939277649, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8226, "grad_norm": 0.33151453733444214, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8339, "grad_norm": 0.32037460803985596, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.822, "grad_norm": 0.31283533573150635, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8233, "grad_norm": 0.27984118461608887, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7755, "grad_norm": 0.316500186920166, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8032, "grad_norm": 0.33708682656288147, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8863, "grad_norm": 0.31026017665863037, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8458, "grad_norm": 0.30874672532081604, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7975, "grad_norm": 0.3257741630077362, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7936, "grad_norm": 0.2865653932094574, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}, {"eval_loss": 1.8103164434432983, "eval_runtime": 56.3917, "eval_samples_per_second": 8.991, "eval_steps_per_second": 1.135, "epoch": 0.9995129079396006, "step": 1026}, {"loss": 1.7013, "grad_norm": 0.2860608398914337, "learning_rate": 0.0002, "epoch": 1.003409644422796, "step": 1030}, {"loss": 1.7521, "grad_norm": 0.3156210780143738, "learning_rate": 0.0002, "epoch": 1.0131514856307842, "step": 1040}, {"loss": 1.6182, "grad_norm": 0.26126575469970703, "learning_rate": 0.0002, "epoch": 1.0228933268387725, "step": 1050}, {"loss": 1.7546, "grad_norm": 0.3019633889198303, "learning_rate": 0.0002, "epoch": 1.032635168046761, "step": 1060}, {"loss": 1.7096, "grad_norm": 0.534140944480896, "learning_rate": 0.0002, "epoch": 1.042377009254749, "step": 1070}, {"loss": 1.664, "grad_norm": 0.311872661113739, "learning_rate": 0.0002, "epoch": 1.0521188504627375, "step": 1080}, {"loss": 1.7056, "grad_norm": 0.3276001513004303, "learning_rate": 0.0002, "epoch": 1.0618606916707258, "step": 1090}, {"loss": 1.7943, "grad_norm": 0.35227468609809875, "learning_rate": 0.0002, "epoch": 1.071602532878714, "step": 1100}, {"loss": 1.7079, "grad_norm": 0.3597564995288849, "learning_rate": 0.0002, "epoch": 1.0813443740867024, "step": 1110}, {"loss": 1.7635, "grad_norm": 0.3547225296497345, "learning_rate": 0.0002, "epoch": 1.0910862152946907, "step": 1120}, {"loss": 1.6859, "grad_norm": 0.3399786353111267, "learning_rate": 0.0002, "epoch": 1.100828056502679, "step": 1130}, {"loss": 1.7478, "grad_norm": 0.3309086263179779, "learning_rate": 0.0002, "epoch": 1.1105698977106673, "step": 1140}, {"loss": 1.7372, "grad_norm": 0.39330706000328064, "learning_rate": 0.0002, "epoch": 1.1203117389186557, "step": 1150}, {"loss": 1.6236, "grad_norm": 0.3628021776676178, "learning_rate": 0.0002, "epoch": 1.130053580126644, "step": 1160}, {"loss": 1.8022, "grad_norm": 0.32995012402534485, "learning_rate": 0.0002, "epoch": 1.1397954213346322, "step": 1170}, {"loss": 1.636, "grad_norm": 0.36292821168899536, "learning_rate": 0.0002, "epoch": 1.1495372625426206, "step": 1180}, {"loss": 1.7156, "grad_norm": 0.3470092713832855, "learning_rate": 0.0002, "epoch": 1.159279103750609, "step": 1190}, {"loss": 1.7001, "grad_norm": 0.3496156334877014, "learning_rate": 0.0002, "epoch": 1.169020944958597, "step": 1200}, {"loss": 1.716, "grad_norm": 0.3442084789276123, "learning_rate": 0.0002, "epoch": 1.1787627861665855, "step": 1210}, {"loss": 1.7763, "grad_norm": 0.34983909130096436, "learning_rate": 0.0002, "epoch": 1.1885046273745739, "step": 1220}, {"loss": 1.6964, "grad_norm": 0.36505937576293945, "learning_rate": 0.0002, "epoch": 1.198246468582562, "step": 1230}, {"loss": 1.7382, "grad_norm": 0.31624770164489746, "learning_rate": 0.0002, "epoch": 1.2079883097905504, "step": 1240}, {"loss": 1.7196, "grad_norm": 0.3528020679950714, "learning_rate": 0.0002, "epoch": 1.2177301509985388, "step": 1250}, {"loss": 1.752, "grad_norm": 0.29294025897979736, "learning_rate": 0.0002, "epoch": 1.2274719922065271, "step": 1260}, {"loss": 1.719, "grad_norm": 0.35048434138298035, "learning_rate": 0.0002, "epoch": 1.2372138334145153, "step": 1270}, {"loss": 1.6806, "grad_norm": 0.35224461555480957, "learning_rate": 0.0002, "epoch": 1.2469556746225037, "step": 1280}, {"loss": 1.6836, "grad_norm": 0.4041554629802704, "learning_rate": 0.0002, "epoch": 1.256697515830492, "step": 1290}, {"loss": 1.6999, "grad_norm": 0.3447791039943695, "learning_rate": 0.0002, "epoch": 1.2664393570384802, "step": 1300}, {"loss": 1.7601, "grad_norm": 0.3315333425998688, "learning_rate": 0.0002, "epoch": 1.2761811982464686, "step": 1310}, {"loss": 1.7325, "grad_norm": 0.3587741255760193, "learning_rate": 0.0002, "epoch": 1.285923039454457, "step": 1320}, {"loss": 1.6707, "grad_norm": 0.3704394996166229, "learning_rate": 0.0002, "epoch": 1.2956648806624451, "step": 1330}, {"loss": 1.7374, "grad_norm": 0.38131803274154663, "learning_rate": 0.0002, "epoch": 1.3054067218704335, "step": 1340}, {"loss": 1.5956, "grad_norm": 0.36109617352485657, "learning_rate": 0.0002, "epoch": 1.3151485630784219, "step": 1350}, {"loss": 1.682, "grad_norm": 0.37283554673194885, "learning_rate": 0.0002, "epoch": 1.32489040428641, "step": 1360}, {"loss": 1.7351, "grad_norm": 0.31808891892433167, "learning_rate": 0.0002, "epoch": 1.3346322454943984, "step": 1370}, {"loss": 1.8368, "grad_norm": 0.3370385766029358, "learning_rate": 0.0002, "epoch": 1.3443740867023868, "step": 1380}, {"loss": 1.6922, "grad_norm": 0.3568558394908905, "learning_rate": 0.0002, "epoch": 1.354115927910375, "step": 1390}, {"loss": 1.7301, "grad_norm": 0.3537410497665405, "learning_rate": 0.0002, "epoch": 1.3638577691183633, "step": 1400}, {"loss": 1.6534, "grad_norm": 0.3536544144153595, "learning_rate": 0.0002, "epoch": 1.3735996103263517, "step": 1410}, {"loss": 1.6829, "grad_norm": 0.3772895038127899, "learning_rate": 0.0002, "epoch": 1.38334145153434, "step": 1420}, {"loss": 1.7344, "grad_norm": 0.38079720735549927, "learning_rate": 0.0002, "epoch": 1.3930832927423282, "step": 1430}, {"loss": 1.7121, "grad_norm": 0.3811109662055969, "learning_rate": 0.0002, "epoch": 1.4028251339503166, "step": 1440}, {"loss": 1.6424, "grad_norm": 0.38586318492889404, "learning_rate": 0.0002, "epoch": 1.412566975158305, "step": 1450}, {"loss": 1.7438, "grad_norm": 0.3405744135379791, "learning_rate": 0.0002, "epoch": 1.4223088163662934, "step": 1460}, {"loss": 1.7483, "grad_norm": 0.39527642726898193, "learning_rate": 0.0002, "epoch": 1.4320506575742815, "step": 1470}, {"loss": 1.7927, "grad_norm": 0.4494728744029999, "learning_rate": 0.0002, "epoch": 1.44179249878227, "step": 1480}, {"loss": 1.5993, "grad_norm": 0.34068453311920166, "learning_rate": 0.0002, "epoch": 1.4515343399902583, "step": 1490}, {"loss": 1.73, "grad_norm": 0.36169710755348206, "learning_rate": 0.0002, "epoch": 1.4612761811982464, "step": 1500}, {"loss": 1.7189, "grad_norm": 0.31519418954849243, "learning_rate": 0.0002, "epoch": 1.4710180224062348, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.35117292404174805, "learning_rate": 0.0002, "epoch": 1.4807598636142232, "step": 1520}, {"loss": 1.6662, "grad_norm": 0.40951141715049744, "learning_rate": 0.0002, "epoch": 1.4905017048222113, "step": 1530}, {"loss": 1.764, "grad_norm": 0.37542906403541565, "learning_rate": 0.0002, "epoch": 1.5002435460301997, "step": 1540}, {"loss": 1.6563, "grad_norm": 0.35395753383636475, "learning_rate": 0.0002, "epoch": 1.509985387238188, "step": 1550}, {"loss": 1.6517, "grad_norm": 0.35497018694877625, "learning_rate": 0.0002, "epoch": 1.5197272284461762, "step": 1560}, {"loss": 1.697, "grad_norm": 0.3693031072616577, "learning_rate": 0.0002, "epoch": 1.5294690696541646, "step": 1570}, {"loss": 1.7193, "grad_norm": 0.34013301134109497, "learning_rate": 0.0002, "epoch": 1.539210910862153, "step": 1580}, {"loss": 1.7364, "grad_norm": 0.37312784790992737, "learning_rate": 0.0002, "epoch": 1.5489527520701412, "step": 1590}, {"loss": 1.6698, "grad_norm": 0.357496440410614, "learning_rate": 0.0002, "epoch": 1.5586945932781295, "step": 1600}, {"loss": 1.7113, "grad_norm": 0.35192370414733887, "learning_rate": 0.0002, "epoch": 1.568436434486118, "step": 1610}, {"loss": 1.7505, "grad_norm": 0.34144821763038635, "learning_rate": 0.0002, "epoch": 1.578178275694106, "step": 1620}, {"loss": 1.6353, "grad_norm": 0.3320509195327759, "learning_rate": 0.0002, "epoch": 1.5879201169020944, "step": 1630}, {"loss": 1.6781, "grad_norm": 0.34178847074508667, "learning_rate": 0.0002, "epoch": 1.5976619581100828, "step": 1640}, {"loss": 1.7318, "grad_norm": 0.36567580699920654, "learning_rate": 0.0002, "epoch": 1.607403799318071, "step": 1650}, {"loss": 1.6971, "grad_norm": 0.35599812865257263, "learning_rate": 0.0002, "epoch": 1.6171456405260596, "step": 1660}, {"loss": 1.7256, "grad_norm": 0.33765384554862976, "learning_rate": 0.0002, "epoch": 1.6268874817340477, "step": 1670}, {"loss": 1.7396, "grad_norm": 0.33142679929733276, "learning_rate": 0.0002, "epoch": 1.636629322942036, "step": 1680}, {"loss": 1.7143, "grad_norm": 0.6959079504013062, "learning_rate": 0.0002, "epoch": 1.6463711641500245, "step": 1690}, {"loss": 1.7665, "grad_norm": 0.35073819756507874, "learning_rate": 0.0002, "epoch": 1.6561130053580126, "step": 1700}, {"loss": 1.7571, "grad_norm": 0.3461478352546692, "learning_rate": 0.0002, "epoch": 1.665854846566001, "step": 1710}, {"loss": 1.6608, "grad_norm": 0.3697752058506012, "learning_rate": 0.0002, "epoch": 1.6755966877739894, "step": 1720}, {"loss": 1.729, "grad_norm": 0.3755154609680176, "learning_rate": 0.0002, "epoch": 1.6853385289819776, "step": 1730}, {"loss": 1.6618, "grad_norm": 0.33977627754211426, "learning_rate": 0.0002, "epoch": 1.695080370189966, "step": 1740}, {"loss": 1.7207, "grad_norm": 0.4001041650772095, "learning_rate": 0.0002, "epoch": 1.7048222113979543, "step": 1750}, {"loss": 1.6756, "grad_norm": 0.36998286843299866, "learning_rate": 0.0002, "epoch": 1.7145640526059425, "step": 1760}, {"loss": 1.7506, "grad_norm": 0.39944565296173096, "learning_rate": 0.0002, "epoch": 1.7243058938139308, "step": 1770}, {"loss": 1.6725, "grad_norm": 0.4002859890460968, "learning_rate": 0.0002, "epoch": 1.7340477350219192, "step": 1780}, {"loss": 1.768, "grad_norm": 0.33336859941482544, "learning_rate": 0.0002, "epoch": 1.7437895762299074, "step": 1790}, {"loss": 1.7816, "grad_norm": 0.35853952169418335, "learning_rate": 0.0002, "epoch": 1.7535314174378958, "step": 1800}, {"loss": 1.7135, "grad_norm": 0.35876700282096863, "learning_rate": 0.0002, "epoch": 1.7632732586458841, "step": 1810}, {"loss": 1.7327, "grad_norm": 0.3497968912124634, "learning_rate": 0.0002, "epoch": 1.7730150998538723, "step": 1820}, {"loss": 1.7128, "grad_norm": 0.33182016015052795, "learning_rate": 0.0002, "epoch": 1.7827569410618607, "step": 1830}, {"loss": 1.7594, "grad_norm": 0.33359771966934204, "learning_rate": 0.0002, "epoch": 1.792498782269849, "step": 1840}, {"loss": 1.8611, "grad_norm": 0.38070961833000183, "learning_rate": 0.0002, "epoch": 1.8022406234778372, "step": 1850}, {"loss": 1.7576, "grad_norm": 0.34111160039901733, "learning_rate": 0.0002, "epoch": 1.8119824646858256, "step": 1860}, {"loss": 1.7533, "grad_norm": 0.4439302980899811, "learning_rate": 0.0002, "epoch": 1.821724305893814, "step": 1870}, {"loss": 1.7931, "grad_norm": 0.37065210938453674, "learning_rate": 0.0002, "epoch": 1.8314661471018021, "step": 1880}, {"loss": 1.7392, "grad_norm": 0.33630406856536865, "learning_rate": 0.0002, "epoch": 1.8412079883097907, "step": 1890}, {"loss": 1.6369, "grad_norm": 0.334553986787796, "learning_rate": 0.0002, "epoch": 1.8509498295177789, "step": 1900}, {"loss": 1.7271, "grad_norm": 0.3603808879852295, "learning_rate": 0.0002, "epoch": 1.860691670725767, "step": 1910}, {"loss": 1.6777, "grad_norm": 0.4307343363761902, "learning_rate": 0.0002, "epoch": 1.8704335119337556, "step": 1920}, {"loss": 1.75, "grad_norm": 0.455602765083313, "learning_rate": 0.0002, "epoch": 1.8801753531417438, "step": 1930}, {"loss": 1.6898, "grad_norm": 0.35242316126823425, "learning_rate": 0.0002, "epoch": 1.8899171943497322, "step": 1940}, {"loss": 1.7152, "grad_norm": 0.3589116632938385, "learning_rate": 0.0002, "epoch": 1.8996590355577205, "step": 1950}, {"loss": 1.7125, "grad_norm": 0.3540741801261902, "learning_rate": 0.0002, "epoch": 1.9094008767657087, "step": 1960}, {"loss": 1.6873, "grad_norm": 0.3547612428665161, "learning_rate": 0.0002, "epoch": 1.919142717973697, "step": 1970}, {"loss": 1.6995, "grad_norm": 0.3485773503780365, "learning_rate": 0.0002, "epoch": 1.9288845591816854, "step": 1980}, {"loss": 1.7301, "grad_norm": 0.3560304641723633, "learning_rate": 0.0002, "epoch": 1.9386264003896736, "step": 1990}, {"loss": 1.748, "grad_norm": 0.33299335837364197, "learning_rate": 0.0002, "epoch": 1.948368241597662, "step": 2000}, {"loss": 1.7397, "grad_norm": 0.35622233152389526, "learning_rate": 0.0002, "epoch": 1.9581100828056504, "step": 2010}, {"loss": 1.7201, "grad_norm": 0.3681301474571228, "learning_rate": 0.0002, "epoch": 1.9678519240136385, "step": 2020}, {"loss": 1.717, "grad_norm": 0.36158084869384766, "learning_rate": 0.0002, "epoch": 1.9775937652216269, "step": 2030}, {"loss": 1.6332, "grad_norm": 0.32560569047927856, "learning_rate": 0.0002, "epoch": 1.9873356064296153, "step": 2040}, {"loss": 1.6958, "grad_norm": 0.37404149770736694, "learning_rate": 0.0002, "epoch": 1.9970774476376034, "step": 2050}, {"eval_loss": 1.8119343519210815, "eval_runtime": 96.0045, "eval_samples_per_second": 5.281, "eval_steps_per_second": 0.667, "epoch": 2.0, "step": 2053}, {"loss": 1.657, "grad_norm": 0.374188631772995, "learning_rate": 0.0002, "epoch": 2.006819288845592, "step": 2060}, {"loss": 1.5655, "grad_norm": 0.421764075756073, "learning_rate": 0.0002, "epoch": 2.01656113005358, "step": 2070}, {"loss": 1.548, "grad_norm": 0.43841829895973206, "learning_rate": 0.0002, "epoch": 2.0263029712615683, "step": 2080}, {"loss": 1.6326, "grad_norm": 0.42298218607902527, "learning_rate": 0.0002, "epoch": 2.036044812469557, "step": 2090}, {"loss": 1.5883, "grad_norm": 0.43669602274894714, "learning_rate": 0.0002, "epoch": 2.045786653677545, "step": 2100}, {"loss": 1.6143, "grad_norm": 0.4080469012260437, "learning_rate": 0.0002, "epoch": 2.0555284948855332, "step": 2110}, {"loss": 1.5591, "grad_norm": 0.483192503452301, "learning_rate": 0.0002, "epoch": 2.065270336093522, "step": 2120}, {"loss": 1.6492, "grad_norm": 0.44427400827407837, "learning_rate": 0.0002, "epoch": 2.07501217730151, "step": 2130}, {"loss": 1.5845, "grad_norm": 0.48835131525993347, "learning_rate": 0.0002, "epoch": 2.084754018509498, "step": 2140}, {"loss": 1.5617, "grad_norm": 0.42733684182167053, "learning_rate": 0.0002, "epoch": 2.0944958597174868, "step": 2150}, {"loss": 1.5562, "grad_norm": 0.4258694648742676, "learning_rate": 0.0002, "epoch": 2.104237700925475, "step": 2160}, {"loss": 1.517, "grad_norm": 0.5164985656738281, "learning_rate": 0.0002, "epoch": 2.113979542133463, "step": 2170}, {"loss": 1.6393, "grad_norm": 0.4279228150844574, "learning_rate": 0.0002, "epoch": 2.1237213833414517, "step": 2180}, {"loss": 1.5895, "grad_norm": 0.48209506273269653, "learning_rate": 0.0002, "epoch": 2.13346322454944, "step": 2190}, {"loss": 1.6142, "grad_norm": 0.4071785509586334, "learning_rate": 0.0002, "epoch": 2.143205065757428, "step": 2200}, {"loss": 1.5469, "grad_norm": 0.4629398584365845, "learning_rate": 0.0002, "epoch": 2.1529469069654166, "step": 2210}, {"loss": 1.56, "grad_norm": 0.44390997290611267, "learning_rate": 0.0002, "epoch": 2.1626887481734047, "step": 2220}, {"loss": 1.5395, "grad_norm": 0.46886971592903137, "learning_rate": 0.0002, "epoch": 2.172430589381393, "step": 2230}, {"loss": 1.6108, "grad_norm": 0.43745434284210205, "learning_rate": 0.0002, "epoch": 2.1821724305893815, "step": 2240}, {"loss": 1.5416, "grad_norm": 0.42737245559692383, "learning_rate": 0.0002, "epoch": 2.1919142717973696, "step": 2250}, {"loss": 1.627, "grad_norm": 0.5028428435325623, "learning_rate": 0.0002, "epoch": 2.201656113005358, "step": 2260}, {"loss": 1.6148, "grad_norm": 0.48987212777137756, "learning_rate": 0.0002, "epoch": 2.2113979542133464, "step": 2270}, {"loss": 1.6107, "grad_norm": 0.48186370730400085, "learning_rate": 0.0002, "epoch": 2.2211397954213346, "step": 2280}, {"loss": 1.6657, "grad_norm": 0.4417429566383362, "learning_rate": 0.0002, "epoch": 2.2308816366293227, "step": 2290}, {"loss": 1.595, "grad_norm": 0.4757710099220276, "learning_rate": 0.0002, "epoch": 2.2406234778373113, "step": 2300}, {"loss": 1.591, "grad_norm": 0.44449448585510254, "learning_rate": 0.0002, "epoch": 2.2503653190452995, "step": 2310}, {"loss": 1.5742, "grad_norm": 0.5070863962173462, "learning_rate": 0.0002, "epoch": 2.260107160253288, "step": 2320}, {"loss": 1.5831, "grad_norm": 0.4967133700847626, "learning_rate": 0.0002, "epoch": 2.269849001461276, "step": 2330}, {"loss": 1.5857, "grad_norm": 0.5110220909118652, "learning_rate": 0.0002, "epoch": 2.2795908426692644, "step": 2340}, {"loss": 1.6266, "grad_norm": 0.47984135150909424, "learning_rate": 0.0002, "epoch": 2.289332683877253, "step": 2350}, {"loss": 1.5927, "grad_norm": 0.5005794763565063, "learning_rate": 0.0002, "epoch": 2.299074525085241, "step": 2360}, {"loss": 1.6131, "grad_norm": 0.4991425573825836, "learning_rate": 0.0002, "epoch": 2.3088163662932293, "step": 2370}, {"loss": 1.5386, "grad_norm": 0.4948616623878479, "learning_rate": 0.0002, "epoch": 2.318558207501218, "step": 2380}, {"loss": 1.5769, "grad_norm": 0.4533160328865051, "learning_rate": 0.0002, "epoch": 2.328300048709206, "step": 2390}, {"loss": 1.5856, "grad_norm": 0.5871071219444275, "learning_rate": 0.0002, "epoch": 2.338041889917194, "step": 2400}, {"loss": 1.591, "grad_norm": 0.5048075914382935, "learning_rate": 0.0002, "epoch": 2.347783731125183, "step": 2410}, {"loss": 1.6165, "grad_norm": 0.4973750412464142, "learning_rate": 0.0002, "epoch": 2.357525572333171, "step": 2420}, {"loss": 1.589, "grad_norm": 0.48294538259506226, "learning_rate": 0.0002, "epoch": 2.367267413541159, "step": 2430}, {"loss": 1.5782, "grad_norm": 0.7180454134941101, "learning_rate": 0.0002, "epoch": 2.3770092547491477, "step": 2440}, {"loss": 1.56, "grad_norm": 0.4627632796764374, "learning_rate": 0.0002, "epoch": 2.386751095957136, "step": 2450}, {"loss": 1.586, "grad_norm": 0.4834378957748413, "learning_rate": 0.0002, "epoch": 2.396492937165124, "step": 2460}, {"loss": 1.6145, "grad_norm": 0.5173670649528503, "learning_rate": 0.0002, "epoch": 2.4062347783731126, "step": 2470}, {"loss": 1.5464, "grad_norm": 0.49652737379074097, "learning_rate": 0.0002, "epoch": 2.4159766195811008, "step": 2480}, {"loss": 1.6977, "grad_norm": 0.47052669525146484, "learning_rate": 0.0002, "epoch": 2.4257184607890894, "step": 2490}, {"loss": 1.6215, "grad_norm": 0.5188006162643433, "learning_rate": 0.0002, "epoch": 2.4354603019970775, "step": 2500}, {"loss": 1.5512, "grad_norm": 0.5010119676589966, "learning_rate": 0.0002, "epoch": 2.4452021432050657, "step": 2510}, {"loss": 1.6403, "grad_norm": 0.4765235483646393, "learning_rate": 0.0002, "epoch": 2.4549439844130543, "step": 2520}, {"loss": 1.5907, "grad_norm": 0.5292699337005615, "learning_rate": 0.0002, "epoch": 2.4646858256210424, "step": 2530}, {"loss": 1.5866, "grad_norm": 0.48555099964141846, "learning_rate": 0.0002, "epoch": 2.4744276668290306, "step": 2540}, {"loss": 1.5361, "grad_norm": 0.4764043092727661, "learning_rate": 0.0002, "epoch": 2.484169508037019, "step": 2550}, {"loss": 1.6545, "grad_norm": 0.47839659452438354, "learning_rate": 0.0002, "epoch": 2.4939113492450073, "step": 2560}, {"loss": 1.6269, "grad_norm": 0.4514436721801758, "learning_rate": 0.0002, "epoch": 2.5036531904529955, "step": 2570}, {"loss": 1.6389, "grad_norm": 0.5681955218315125, "learning_rate": 0.0002, "epoch": 2.513395031660984, "step": 2580}, {"loss": 1.5976, "grad_norm": 0.49655985832214355, "learning_rate": 0.0002, "epoch": 2.5231368728689723, "step": 2590}, {"loss": 1.6052, "grad_norm": 0.5077657103538513, "learning_rate": 0.0002, "epoch": 2.5328787140769604, "step": 2600}, {"loss": 1.5658, "grad_norm": 0.5643279552459717, "learning_rate": 0.0002, "epoch": 2.542620555284949, "step": 2610}, {"loss": 1.5455, "grad_norm": 0.4715031087398529, "learning_rate": 0.0002, "epoch": 2.552362396492937, "step": 2620}, {"loss": 1.5907, "grad_norm": 0.528400719165802, "learning_rate": 0.0002, "epoch": 2.5621042377009253, "step": 2630}, {"loss": 1.5452, "grad_norm": 0.49469611048698425, "learning_rate": 0.0002, "epoch": 2.571846078908914, "step": 2640}, {"loss": 1.5904, "grad_norm": 0.4567806124687195, "learning_rate": 0.0002, "epoch": 2.581587920116902, "step": 2650}, {"loss": 1.6242, "grad_norm": 0.5357107520103455, "learning_rate": 0.0002, "epoch": 2.5913297613248902, "step": 2660}, {"loss": 1.6078, "grad_norm": 0.46977677941322327, "learning_rate": 0.0002, "epoch": 2.601071602532879, "step": 2670}, {"loss": 1.6994, "grad_norm": 0.6626771092414856, "learning_rate": 0.0002, "epoch": 2.610813443740867, "step": 2680}, {"loss": 1.5888, "grad_norm": 0.4587472081184387, "learning_rate": 0.0002, "epoch": 2.620555284948855, "step": 2690}, {"loss": 1.6002, "grad_norm": 0.4816797077655792, "learning_rate": 0.0002, "epoch": 2.6302971261568437, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.4856809675693512, "learning_rate": 0.0002, "epoch": 2.640038967364832, "step": 2710}, {"loss": 1.563, "grad_norm": 0.46010780334472656, "learning_rate": 0.0002, "epoch": 2.64978080857282, "step": 2720}, {"loss": 1.5374, "grad_norm": 0.4637954533100128, "learning_rate": 0.0002, "epoch": 2.6595226497808087, "step": 2730}, {"loss": 1.6493, "grad_norm": 0.5954997539520264, "learning_rate": 0.0002, "epoch": 2.669264490988797, "step": 2740}, {"loss": 1.5795, "grad_norm": 0.5071861743927002, "learning_rate": 0.0002, "epoch": 2.679006332196785, "step": 2750}, {"loss": 1.573, "grad_norm": 0.5415477156639099, "learning_rate": 0.0002, "epoch": 2.6887481734047736, "step": 2760}, {"loss": 1.5476, "grad_norm": 0.5618549585342407, "learning_rate": 0.0002, "epoch": 2.6984900146127617, "step": 2770}, {"loss": 1.608, "grad_norm": 0.49338817596435547, "learning_rate": 0.0002, "epoch": 2.70823185582075, "step": 2780}, {"loss": 1.6529, "grad_norm": 0.5149586796760559, "learning_rate": 0.0002, "epoch": 2.7179736970287385, "step": 2790}, {"loss": 1.6279, "grad_norm": 0.6247242093086243, "learning_rate": 0.0002, "epoch": 2.7277155382367266, "step": 2800}, {"loss": 1.4655, "grad_norm": 0.4749542474746704, "learning_rate": 0.0002, "epoch": 2.737457379444715, "step": 2810}, {"loss": 1.5984, "grad_norm": 0.4979191720485687, "learning_rate": 0.0002, "epoch": 2.7471992206527034, "step": 2820}, {"loss": 1.6377, "grad_norm": 0.4885074198246002, "learning_rate": 0.0002, "epoch": 2.7569410618606915, "step": 2830}, {"loss": 1.6529, "grad_norm": 0.5047747492790222, "learning_rate": 0.0002, "epoch": 2.76668290306868, "step": 2840}, {"loss": 1.6574, "grad_norm": 0.5280140042304993, "learning_rate": 0.0002, "epoch": 2.7764247442766683, "step": 2850}, {"loss": 1.5639, "grad_norm": 0.477668434381485, "learning_rate": 0.0002, "epoch": 2.7861665854846565, "step": 2860}, {"loss": 1.5923, "grad_norm": 0.4816327393054962, "learning_rate": 0.0002, "epoch": 2.795908426692645, "step": 2870}, {"loss": 1.6377, "grad_norm": 0.523259162902832, "learning_rate": 0.0002, "epoch": 2.805650267900633, "step": 2880}, {"loss": 1.5779, "grad_norm": 0.5045270919799805, "learning_rate": 0.0002, "epoch": 2.8153921091086214, "step": 2890}, {"loss": 1.5915, "grad_norm": 0.47986042499542236, "learning_rate": 0.0002, "epoch": 2.82513395031661, "step": 2900}, {"loss": 1.5997, "grad_norm": 0.4858797490596771, "learning_rate": 0.0002, "epoch": 2.834875791524598, "step": 2910}, {"loss": 1.5664, "grad_norm": 0.5261512398719788, "learning_rate": 0.0002, "epoch": 2.8446176327325867, "step": 2920}, {"loss": 1.5775, "grad_norm": 0.630550742149353, "learning_rate": 0.0002, "epoch": 2.854359473940575, "step": 2930}, {"loss": 1.5889, "grad_norm": 0.49119752645492554, "learning_rate": 0.0002, "epoch": 2.864101315148563, "step": 2940}, {"loss": 1.6033, "grad_norm": 0.4779070317745209, "learning_rate": 0.0002, "epoch": 2.8738431563565516, "step": 2950}, {"loss": 1.6353, "grad_norm": 0.5059782266616821, "learning_rate": 0.0002, "epoch": 2.88358499756454, "step": 2960}, {"loss": 1.6403, "grad_norm": 0.5466655492782593, "learning_rate": 0.0002, "epoch": 2.893326838772528, "step": 2970}, {"loss": 1.619, "grad_norm": 0.4865640103816986, "learning_rate": 0.0002, "epoch": 2.9030686799805165, "step": 2980}, {"loss": 1.5712, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.9128105211885047, "step": 2990}, {"loss": 1.6216, "grad_norm": 0.9112305641174316, "learning_rate": 0.0002, "epoch": 2.922552362396493, "step": 3000}, {"loss": 1.6493, "grad_norm": 0.3938814103603363, "learning_rate": 0.0002, "epoch": 2.9322942036044815, "step": 3010}, {"loss": 1.5446, "grad_norm": 0.5500800609588623, "learning_rate": 0.0002, "epoch": 2.9420360448124696, "step": 3020}, {"loss": 1.5626, "grad_norm": 0.5346390604972839, "learning_rate": 0.0002, "epoch": 2.9517778860204578, "step": 3030}, {"loss": 1.5783, "grad_norm": 0.5245014429092407, "learning_rate": 0.0002, "epoch": 2.9615197272284464, "step": 3040}, {"loss": 1.5549, "grad_norm": 0.4906884431838989, "learning_rate": 0.0002, "epoch": 2.9712615684364345, "step": 3050}, {"loss": 1.5796, "grad_norm": 0.47086769342422485, "learning_rate": 0.0002, "epoch": 2.9810034096444227, "step": 3060}, {"loss": 1.5917, "grad_norm": 0.5290229320526123, "learning_rate": 0.0002, "epoch": 2.9907452508524113, "step": 3070}, {"eval_loss": 1.8463934659957886, "eval_runtime": 56.2401, "eval_samples_per_second": 9.015, "eval_steps_per_second": 1.138, "epoch": 2.9995129079396006, "step": 3079}, {"loss": 1.5744, "grad_norm": 0.49992576241493225, "learning_rate": 0.0002, "epoch": 3.0004870920603994, "step": 3080}, {"loss": 1.4125, "grad_norm": 0.8242783546447754, "learning_rate": 0.0002, "epoch": 3.0102289332683876, "step": 3090}, {"loss": 1.394, "grad_norm": 0.6330569386482239, "learning_rate": 0.0002, "epoch": 3.019970774476376, "step": 3100}, {"loss": 1.4942, "grad_norm": 0.566097617149353, "learning_rate": 0.0002, "epoch": 3.0297126156843643, "step": 3110}, {"loss": 1.4365, "grad_norm": 0.6337586045265198, "learning_rate": 0.0002, "epoch": 3.0394544568923525, "step": 3120}, {"loss": 1.3916, "grad_norm": 0.7339403033256531, "learning_rate": 0.0002, "epoch": 3.049196298100341, "step": 3130}, {"loss": 1.4617, "grad_norm": 0.7187346816062927, "learning_rate": 0.0002, "epoch": 3.0589381393083293, "step": 3140}, {"loss": 1.3453, "grad_norm": 0.7116255760192871, "learning_rate": 0.0002, "epoch": 3.0686799805163174, "step": 3150}, {"loss": 1.4452, "grad_norm": 0.6493807435035706, "learning_rate": 0.0002, "epoch": 3.078421821724306, "step": 3160}, {"loss": 1.351, "grad_norm": 0.6777266263961792, "learning_rate": 0.0002, "epoch": 3.088163662932294, "step": 3170}, {"loss": 1.4362, "grad_norm": 0.6342006325721741, "learning_rate": 0.0002, "epoch": 3.0979055041402823, "step": 3180}, {"loss": 1.4748, "grad_norm": 0.6608964204788208, "learning_rate": 0.0002, "epoch": 3.107647345348271, "step": 3190}, {"loss": 1.375, "grad_norm": 0.7230247259140015, "learning_rate": 0.0002, "epoch": 3.117389186556259, "step": 3200}, {"loss": 1.4049, "grad_norm": 0.650368332862854, "learning_rate": 0.0002, "epoch": 3.1271310277642472, "step": 3210}, {"loss": 1.409, "grad_norm": 0.7319342494010925, "learning_rate": 0.0002, "epoch": 3.136872868972236, "step": 3220}, {"loss": 1.3872, "grad_norm": 0.7159963846206665, "learning_rate": 0.0002, "epoch": 3.146614710180224, "step": 3230}, {"loss": 1.5076, "grad_norm": 0.8905230164527893, "learning_rate": 0.0002, "epoch": 3.156356551388212, "step": 3240}, {"loss": 1.3161, "grad_norm": 0.6920804381370544, "learning_rate": 0.0002, "epoch": 3.1660983925962007, "step": 3250}, {"loss": 1.3786, "grad_norm": 0.6782063841819763, "learning_rate": 0.0002, "epoch": 3.175840233804189, "step": 3260}, {"loss": 1.5153, "grad_norm": 0.735325276851654, "learning_rate": 0.0002, "epoch": 3.1855820750121775, "step": 3270}, {"loss": 1.4027, "grad_norm": 0.6657978296279907, "learning_rate": 0.0002, "epoch": 3.1953239162201656, "step": 3280}, {"loss": 1.3456, "grad_norm": 0.771315336227417, "learning_rate": 0.0002, "epoch": 3.205065757428154, "step": 3290}, {"loss": 1.3236, "grad_norm": 0.6492983102798462, "learning_rate": 0.0002, "epoch": 3.2148075986361424, "step": 3300}, {"loss": 1.4125, "grad_norm": 0.7513770461082458, "learning_rate": 0.0002, "epoch": 3.2245494398441306, "step": 3310}, {"loss": 1.4032, "grad_norm": 0.7091423869132996, "learning_rate": 0.0002, "epoch": 3.2342912810521187, "step": 3320}, {"loss": 1.4585, "grad_norm": 0.6663975119590759, "learning_rate": 0.0002, "epoch": 3.2440331222601073, "step": 3330}, {"loss": 1.3968, "grad_norm": 0.6813122034072876, "learning_rate": 0.0002, "epoch": 3.2537749634680955, "step": 3340}, {"loss": 1.3681, "grad_norm": 0.6602569818496704, "learning_rate": 0.0002, "epoch": 3.2635168046760836, "step": 3350}, {"loss": 1.4533, "grad_norm": 0.718270480632782, "learning_rate": 0.0002, "epoch": 3.2732586458840722, "step": 3360}, {"loss": 1.4076, "grad_norm": 0.6884173154830933, "learning_rate": 0.0002, "epoch": 3.2830004870920604, "step": 3370}, {"loss": 1.4144, "grad_norm": 0.7039775848388672, "learning_rate": 0.0002, "epoch": 3.2927423283000485, "step": 3380}, {"loss": 1.5077, "grad_norm": 0.7444299459457397, "learning_rate": 0.0002, "epoch": 3.302484169508037, "step": 3390}, {"loss": 1.4255, "grad_norm": 0.7187064290046692, "learning_rate": 0.0002, "epoch": 3.3122260107160253, "step": 3400}, {"loss": 1.3684, "grad_norm": 0.599396288394928, "learning_rate": 0.0002, "epoch": 3.3219678519240134, "step": 3410}, {"loss": 1.4819, "grad_norm": 0.7670390009880066, "learning_rate": 0.0002, "epoch": 3.331709693132002, "step": 3420}, {"loss": 1.4411, "grad_norm": 0.6654478311538696, "learning_rate": 0.0002, "epoch": 3.34145153433999, "step": 3430}, {"loss": 1.4257, "grad_norm": 0.6644385457038879, "learning_rate": 0.0002, "epoch": 3.351193375547979, "step": 3440}, {"loss": 1.4508, "grad_norm": 0.6974098086357117, "learning_rate": 0.0002, "epoch": 3.360935216755967, "step": 3450}, {"loss": 1.3807, "grad_norm": 0.7350399494171143, "learning_rate": 0.0002, "epoch": 3.370677057963955, "step": 3460}, {"loss": 1.4176, "grad_norm": 0.714721143245697, "learning_rate": 0.0002, "epoch": 3.3804188991719437, "step": 3470}, {"loss": 1.4325, "grad_norm": 0.7006027698516846, "learning_rate": 0.0002, "epoch": 3.390160740379932, "step": 3480}, {"loss": 1.4888, "grad_norm": 0.6767925024032593, "learning_rate": 0.0002, "epoch": 3.39990258158792, "step": 3490}, {"loss": 1.4116, "grad_norm": 0.6721355319023132, "learning_rate": 0.0002, "epoch": 3.4096444227959086, "step": 3500}, {"loss": 1.443, "grad_norm": 0.6845725178718567, "learning_rate": 0.0002, "epoch": 3.419386264003897, "step": 3510}, {"loss": 1.4832, "grad_norm": 0.6882196664810181, "learning_rate": 0.0002, "epoch": 3.429128105211885, "step": 3520}, {"loss": 1.4962, "grad_norm": 0.7663240432739258, "learning_rate": 0.0002, "epoch": 3.4388699464198735, "step": 3530}, {"loss": 1.4644, "grad_norm": 0.6304219365119934, "learning_rate": 0.0002, "epoch": 3.4486117876278617, "step": 3540}, {"loss": 1.4918, "grad_norm": 0.668678879737854, "learning_rate": 0.0002, "epoch": 3.45835362883585, "step": 3550}, {"loss": 1.4874, "grad_norm": 0.7526912093162537, "learning_rate": 0.0002, "epoch": 3.4680954700438384, "step": 3560}, {"loss": 1.4249, "grad_norm": 1.089495301246643, "learning_rate": 0.0002, "epoch": 3.4778373112518266, "step": 3570}, {"loss": 1.3871, "grad_norm": 0.7282902002334595, "learning_rate": 0.0002, "epoch": 3.4875791524598148, "step": 3580}, {"loss": 1.5077, "grad_norm": 0.6540156602859497, "learning_rate": 0.0002, "epoch": 3.4973209936678034, "step": 3590}, {"loss": 1.4367, "grad_norm": 0.6449568867683411, "learning_rate": 0.0002, "epoch": 3.5070628348757915, "step": 3600}, {"loss": 1.4532, "grad_norm": 0.7262216210365295, "learning_rate": 0.0002, "epoch": 3.5168046760837797, "step": 3610}, {"loss": 1.4374, "grad_norm": 0.6048615574836731, "learning_rate": 0.0002, "epoch": 3.5265465172917683, "step": 3620}, {"loss": 1.3877, "grad_norm": 0.6780537366867065, "learning_rate": 0.0002, "epoch": 3.5362883584997564, "step": 3630}, {"loss": 1.422, "grad_norm": 0.6851925253868103, "learning_rate": 0.0002, "epoch": 3.5460301997077446, "step": 3640}, {"loss": 1.3425, "grad_norm": 0.6530634164810181, "learning_rate": 0.0002, "epoch": 3.555772040915733, "step": 3650}, {"loss": 1.4879, "grad_norm": 0.7193992733955383, "learning_rate": 0.0002, "epoch": 3.5655138821237213, "step": 3660}, {"loss": 1.4555, "grad_norm": 0.767496645450592, "learning_rate": 0.0002, "epoch": 3.5752557233317095, "step": 3670}, {"loss": 1.4824, "grad_norm": 0.6912919282913208, "learning_rate": 0.0002, "epoch": 3.584997564539698, "step": 3680}, {"loss": 1.4497, "grad_norm": 0.7383436560630798, "learning_rate": 0.0002, "epoch": 3.5947394057476862, "step": 3690}, {"loss": 1.4822, "grad_norm": 0.6746662855148315, "learning_rate": 0.0002, "epoch": 3.6044812469556744, "step": 3700}, {"loss": 1.4904, "grad_norm": 0.6885138750076294, "learning_rate": 0.0002, "epoch": 3.614223088163663, "step": 3710}, {"loss": 1.4044, "grad_norm": 0.6694392561912537, "learning_rate": 0.0002, "epoch": 3.623964929371651, "step": 3720}, {"loss": 1.3719, "grad_norm": 0.812358021736145, "learning_rate": 0.0002, "epoch": 3.6337067705796393, "step": 3730}, {"loss": 1.4603, "grad_norm": 0.7267130017280579, "learning_rate": 0.0002, "epoch": 3.643448611787628, "step": 3740}, {"loss": 1.4574, "grad_norm": 0.6958749294281006, "learning_rate": 0.0002, "epoch": 3.653190452995616, "step": 3750}, {"loss": 1.4346, "grad_norm": 0.6805673241615295, "learning_rate": 0.0002, "epoch": 3.6629322942036042, "step": 3760}, {"loss": 1.4338, "grad_norm": 0.7184410095214844, "learning_rate": 0.0002, "epoch": 3.672674135411593, "step": 3770}, {"loss": 1.3935, "grad_norm": 0.7716330289840698, "learning_rate": 0.0002, "epoch": 3.682415976619581, "step": 3780}, {"loss": 1.384, "grad_norm": 0.6675831079483032, "learning_rate": 0.0002, "epoch": 3.6921578178275696, "step": 3790}, {"loss": 1.401, "grad_norm": 0.6480095386505127, "learning_rate": 0.0002, "epoch": 3.7018996590355577, "step": 3800}, {"loss": 1.5303, "grad_norm": 0.6559418439865112, "learning_rate": 0.0002, "epoch": 3.711641500243546, "step": 3810}, {"loss": 1.4341, "grad_norm": 0.6596545577049255, "learning_rate": 0.0002, "epoch": 3.7213833414515345, "step": 3820}, {"loss": 1.4508, "grad_norm": 0.7172950506210327, "learning_rate": 0.0002, "epoch": 3.7311251826595226, "step": 3830}, {"loss": 1.446, "grad_norm": 0.796148419380188, "learning_rate": 0.0002, "epoch": 3.740867023867511, "step": 3840}, {"loss": 1.4992, "grad_norm": 0.6600322723388672, "learning_rate": 0.0002, "epoch": 3.7506088650754994, "step": 3850}, {"loss": 1.4201, "grad_norm": 0.6776387691497803, "learning_rate": 0.0002, "epoch": 3.7603507062834876, "step": 3860}, {"loss": 1.3893, "grad_norm": 0.7768304347991943, "learning_rate": 0.0002, "epoch": 3.770092547491476, "step": 3870}, {"loss": 1.4886, "grad_norm": 1.0579794645309448, "learning_rate": 0.0002, "epoch": 3.7798343886994643, "step": 3880}, {"loss": 1.4556, "grad_norm": 0.6757252812385559, "learning_rate": 0.0002, "epoch": 3.7895762299074525, "step": 3890}, {"loss": 1.4647, "grad_norm": 0.6706996560096741, "learning_rate": 0.0002, "epoch": 3.799318071115441, "step": 3900}, {"loss": 1.4104, "grad_norm": 0.7026948928833008, "learning_rate": 0.0002, "epoch": 3.809059912323429, "step": 3910}, {"loss": 1.5487, "grad_norm": 0.6437768340110779, "learning_rate": 0.0002, "epoch": 3.8188017535314174, "step": 3920}, {"loss": 1.4678, "grad_norm": 0.7015706300735474, "learning_rate": 0.0002, "epoch": 3.828543594739406, "step": 3930}, {"loss": 1.4891, "grad_norm": 0.7049482464790344, "learning_rate": 0.0002, "epoch": 3.838285435947394, "step": 3940}, {"loss": 1.4208, "grad_norm": 0.6533724665641785, "learning_rate": 0.0002, "epoch": 3.8480272771553823, "step": 3950}, {"loss": 1.4435, "grad_norm": 0.7312499284744263, "learning_rate": 0.0002, "epoch": 3.857769118363371, "step": 3960}, {"loss": 1.3886, "grad_norm": 0.6858801245689392, "learning_rate": 0.0002, "epoch": 3.867510959571359, "step": 3970}, {"loss": 1.4423, "grad_norm": 0.770423173904419, "learning_rate": 0.0002, "epoch": 3.877252800779347, "step": 3980}, {"loss": 1.5029, "grad_norm": 0.6987539529800415, "learning_rate": 0.0002, "epoch": 3.886994641987336, "step": 3990}, {"loss": 1.4791, "grad_norm": 0.7072722315788269, "learning_rate": 0.0002, "epoch": 3.896736483195324, "step": 4000}, {"loss": 1.528, "grad_norm": 0.6492931842803955, "learning_rate": 0.0002, "epoch": 3.906478324403312, "step": 4010}, {"loss": 1.3824, "grad_norm": 0.7716232538223267, "learning_rate": 0.0002, "epoch": 3.9162201656113007, "step": 4020}, {"loss": 1.4758, "grad_norm": 0.722949743270874, "learning_rate": 0.0002, "epoch": 3.925962006819289, "step": 4030}, {"loss": 1.3914, "grad_norm": 0.7434365749359131, "learning_rate": 0.0002, "epoch": 3.935703848027277, "step": 4040}, {"loss": 1.4763, "grad_norm": 0.6691509485244751, "learning_rate": 0.0002, "epoch": 3.9454456892352656, "step": 4050}, {"loss": 1.4555, "grad_norm": 0.6850284337997437, "learning_rate": 0.0002, "epoch": 3.9551875304432538, "step": 4060}, {"loss": 1.5275, "grad_norm": 0.6954452991485596, "learning_rate": 0.0002, "epoch": 3.964929371651242, "step": 4070}, {"loss": 1.417, "grad_norm": 0.9316364526748657, "learning_rate": 0.0002, "epoch": 3.9746712128592305, "step": 4080}, {"loss": 1.4532, "grad_norm": 0.6908289194107056, "learning_rate": 0.0002, "epoch": 3.9844130540672187, "step": 4090}, {"loss": 1.4404, "grad_norm": 0.666782021522522, "learning_rate": 0.0002, "epoch": 3.994154895275207, "step": 4100}, {"eval_loss": 1.9233275651931763, "eval_runtime": 55.9536, "eval_samples_per_second": 9.061, "eval_steps_per_second": 1.144, "epoch": 4.0, "step": 4106}, {"loss": 1.3489, "grad_norm": 0.7726166248321533, "learning_rate": 0.0002, "epoch": 4.003896736483195, "step": 4110}, {"loss": 1.1415, "grad_norm": 1.1338967084884644, "learning_rate": 0.0002, "epoch": 4.013638577691184, "step": 4120}, {"loss": 1.2212, "grad_norm": 0.9530029296875, "learning_rate": 0.0002, "epoch": 4.023380418899172, "step": 4130}, {"loss": 1.2002, "grad_norm": 1.1058554649353027, "learning_rate": 0.0002, "epoch": 4.03312226010716, "step": 4140}, {"loss": 1.2381, "grad_norm": 0.8765049576759338, "learning_rate": 0.0002, "epoch": 4.042864101315149, "step": 4150}, {"loss": 1.2708, "grad_norm": 1.1774667501449585, "learning_rate": 0.0002, "epoch": 4.052605942523137, "step": 4160}, {"loss": 1.2116, "grad_norm": 0.9301433563232422, "learning_rate": 0.0002, "epoch": 4.062347783731125, "step": 4170}, {"loss": 1.1807, "grad_norm": 1.0196778774261475, "learning_rate": 0.0002, "epoch": 4.072089624939114, "step": 4180}, {"loss": 1.2602, "grad_norm": 1.1380577087402344, "learning_rate": 0.0002, "epoch": 4.081831466147102, "step": 4190}, {"loss": 1.2521, "grad_norm": 0.9121319651603699, "learning_rate": 0.0002, "epoch": 4.09157330735509, "step": 4200}, {"loss": 1.1747, "grad_norm": 0.9495378732681274, "learning_rate": 0.0002, "epoch": 4.101315148563079, "step": 4210}, {"loss": 1.1829, "grad_norm": 0.8058680295944214, "learning_rate": 0.0002, "epoch": 4.1110569897710665, "step": 4220}, {"loss": 1.1732, "grad_norm": 1.000887393951416, "learning_rate": 0.0002, "epoch": 4.120798830979055, "step": 4230}, {"loss": 1.1947, "grad_norm": 0.9529102444648743, "learning_rate": 0.0002, "epoch": 4.130540672187044, "step": 4240}, {"loss": 1.2104, "grad_norm": 1.0257115364074707, "learning_rate": 0.0002, "epoch": 4.140282513395031, "step": 4250}, {"loss": 1.2293, "grad_norm": 0.9590303897857666, "learning_rate": 0.0002, "epoch": 4.15002435460302, "step": 4260}, {"loss": 1.1918, "grad_norm": 1.065291166305542, "learning_rate": 0.0002, "epoch": 4.159766195811009, "step": 4270}, {"loss": 1.2323, "grad_norm": 0.8819697499275208, "learning_rate": 0.0002, "epoch": 4.169508037018996, "step": 4280}, {"loss": 1.2167, "grad_norm": 1.0335261821746826, "learning_rate": 0.0002, "epoch": 4.179249878226985, "step": 4290}, {"loss": 1.2131, "grad_norm": 0.8872809410095215, "learning_rate": 0.0002, "epoch": 4.1889917194349735, "step": 4300}, {"loss": 1.2794, "grad_norm": 0.9883159399032593, "learning_rate": 0.0002, "epoch": 4.198733560642961, "step": 4310}, {"loss": 1.2544, "grad_norm": 1.0254192352294922, "learning_rate": 0.0002, "epoch": 4.20847540185095, "step": 4320}, {"loss": 1.2595, "grad_norm": 0.9432600736618042, "learning_rate": 0.0002, "epoch": 4.218217243058938, "step": 4330}, {"loss": 1.2684, "grad_norm": 1.1008676290512085, "learning_rate": 0.0002, "epoch": 4.227959084266926, "step": 4340}, {"loss": 1.2149, "grad_norm": 1.0829699039459229, "learning_rate": 0.0002, "epoch": 4.237700925474915, "step": 4350}, {"loss": 1.2621, "grad_norm": 1.016847848892212, "learning_rate": 0.0002, "epoch": 4.247442766682903, "step": 4360}, {"loss": 1.2375, "grad_norm": 0.8924864530563354, "learning_rate": 0.0002, "epoch": 4.257184607890891, "step": 4370}, {"loss": 1.1987, "grad_norm": 0.9300530552864075, "learning_rate": 0.0002, "epoch": 4.26692644909888, "step": 4380}, {"loss": 1.1696, "grad_norm": 0.9684814810752869, "learning_rate": 0.0002, "epoch": 4.276668290306868, "step": 4390}, {"loss": 1.2006, "grad_norm": 0.9916250705718994, "learning_rate": 0.0002, "epoch": 4.286410131514856, "step": 4400}, {"loss": 1.2402, "grad_norm": 0.903680145740509, "learning_rate": 0.0002, "epoch": 4.2961519727228445, "step": 4410}, {"loss": 1.2022, "grad_norm": 0.8713505268096924, "learning_rate": 0.0002, "epoch": 4.305893813930833, "step": 4420}, {"loss": 1.1957, "grad_norm": 0.9983905553817749, "learning_rate": 0.0002, "epoch": 4.315635655138821, "step": 4430}, {"loss": 1.2676, "grad_norm": 1.1689040660858154, "learning_rate": 0.0002, "epoch": 4.3253774963468095, "step": 4440}, {"loss": 1.2166, "grad_norm": 0.9316853880882263, "learning_rate": 0.0002, "epoch": 4.335119337554798, "step": 4450}, {"loss": 1.222, "grad_norm": 0.9175887107849121, "learning_rate": 0.0002, "epoch": 4.344861178762786, "step": 4460}, {"loss": 1.2571, "grad_norm": 0.9348906874656677, "learning_rate": 0.0002, "epoch": 4.354603019970774, "step": 4470}, {"loss": 1.2764, "grad_norm": 0.9727016687393188, "learning_rate": 0.0002, "epoch": 4.364344861178763, "step": 4480}, {"loss": 1.2616, "grad_norm": 0.9843429923057556, "learning_rate": 0.0002, "epoch": 4.374086702386751, "step": 4490}, {"loss": 1.2488, "grad_norm": 0.9615852236747742, "learning_rate": 0.0002, "epoch": 4.383828543594739, "step": 4500}, {"loss": 1.1718, "grad_norm": 0.9688583612442017, "learning_rate": 0.0002, "epoch": 4.393570384802728, "step": 4510}, {"loss": 1.2546, "grad_norm": 0.9933668375015259, "learning_rate": 0.0002, "epoch": 4.403312226010716, "step": 4520}, {"loss": 1.2355, "grad_norm": 1.0626686811447144, "learning_rate": 0.0002, "epoch": 4.413054067218704, "step": 4530}, {"loss": 1.2425, "grad_norm": 0.9536267518997192, "learning_rate": 0.0002, "epoch": 4.422795908426693, "step": 4540}, {"loss": 1.2562, "grad_norm": 0.9777140021324158, "learning_rate": 0.0002, "epoch": 4.432537749634681, "step": 4550}, {"loss": 1.2878, "grad_norm": 0.980780839920044, "learning_rate": 0.0002, "epoch": 4.442279590842669, "step": 4560}, {"loss": 1.2597, "grad_norm": 1.0147196054458618, "learning_rate": 0.0002, "epoch": 4.452021432050658, "step": 4570}, {"loss": 1.2148, "grad_norm": 0.9763361811637878, "learning_rate": 0.0002, "epoch": 4.461763273258645, "step": 4580}, {"loss": 1.3076, "grad_norm": 1.0300798416137695, "learning_rate": 0.0002, "epoch": 4.471505114466634, "step": 4590}, {"loss": 1.2665, "grad_norm": 0.8833121657371521, "learning_rate": 0.0002, "epoch": 4.481246955674623, "step": 4600}, {"loss": 1.1899, "grad_norm": 1.1214020252227783, "learning_rate": 0.0002, "epoch": 4.490988796882611, "step": 4610}, {"loss": 1.2579, "grad_norm": 0.8843787908554077, "learning_rate": 0.0002, "epoch": 4.500730638090599, "step": 4620}, {"loss": 1.2633, "grad_norm": 0.9942020773887634, "learning_rate": 0.0002, "epoch": 4.5104724792985875, "step": 4630}, {"loss": 1.3172, "grad_norm": 1.0033202171325684, "learning_rate": 0.0002, "epoch": 4.520214320506576, "step": 4640}, {"loss": 1.2024, "grad_norm": 0.8767235279083252, "learning_rate": 0.0002, "epoch": 4.529956161714564, "step": 4650}, {"loss": 1.2714, "grad_norm": 1.0117276906967163, "learning_rate": 0.0002, "epoch": 4.539698002922552, "step": 4660}, {"loss": 1.2911, "grad_norm": 1.2787362337112427, "learning_rate": 0.0002, "epoch": 4.549439844130541, "step": 4670}, {"loss": 1.2603, "grad_norm": 0.8824878931045532, "learning_rate": 0.0002, "epoch": 4.559181685338529, "step": 4680}, {"loss": 1.2905, "grad_norm": 0.9209560751914978, "learning_rate": 0.0002, "epoch": 4.568923526546517, "step": 4690}, {"loss": 1.1916, "grad_norm": 1.1064010858535767, "learning_rate": 0.0002, "epoch": 4.578665367754506, "step": 4700}, {"loss": 1.2217, "grad_norm": 0.8914572596549988, "learning_rate": 0.0002, "epoch": 4.588407208962494, "step": 4710}, {"loss": 1.2861, "grad_norm": 1.0412265062332153, "learning_rate": 0.0002, "epoch": 4.598149050170482, "step": 4720}, {"loss": 1.262, "grad_norm": 1.1950221061706543, "learning_rate": 0.0002, "epoch": 4.607890891378471, "step": 4730}, {"loss": 1.2659, "grad_norm": 0.8938062787055969, "learning_rate": 0.0002, "epoch": 4.617632732586459, "step": 4740}, {"loss": 1.2621, "grad_norm": 0.9849569201469421, "learning_rate": 0.0002, "epoch": 4.627374573794447, "step": 4750}, {"loss": 1.2341, "grad_norm": 1.0081515312194824, "learning_rate": 0.0002, "epoch": 4.637116415002436, "step": 4760}, {"loss": 1.2023, "grad_norm": 0.8566309213638306, "learning_rate": 0.0002, "epoch": 4.6468582562104235, "step": 4770}, {"loss": 1.2723, "grad_norm": 1.1750118732452393, "learning_rate": 0.0002, "epoch": 4.656600097418412, "step": 4780}, {"loss": 1.2537, "grad_norm": 0.925502598285675, "learning_rate": 0.0002, "epoch": 4.666341938626401, "step": 4790}, {"loss": 1.2146, "grad_norm": 1.0402472019195557, "learning_rate": 0.0002, "epoch": 4.676083779834388, "step": 4800}, {"loss": 1.2555, "grad_norm": 0.9772472977638245, "learning_rate": 0.0002, "epoch": 4.685825621042377, "step": 4810}, {"loss": 1.2667, "grad_norm": 0.9082779288291931, "learning_rate": 0.0002, "epoch": 4.695567462250366, "step": 4820}, {"loss": 1.2465, "grad_norm": 0.8026862740516663, "learning_rate": 0.0002, "epoch": 4.705309303458353, "step": 4830}, {"loss": 1.3369, "grad_norm": 1.1631089448928833, "learning_rate": 0.0002, "epoch": 4.715051144666342, "step": 4840}, {"loss": 1.261, "grad_norm": 0.9384787678718567, "learning_rate": 0.0002, "epoch": 4.7247929858743305, "step": 4850}, {"loss": 1.2588, "grad_norm": 1.2151581048965454, "learning_rate": 0.0002, "epoch": 4.734534827082318, "step": 4860}, {"loss": 1.363, "grad_norm": 0.9679436087608337, "learning_rate": 0.0002, "epoch": 4.744276668290307, "step": 4870}, {"loss": 1.3292, "grad_norm": 0.8352158069610596, "learning_rate": 0.0002, "epoch": 4.754018509498295, "step": 4880}, {"loss": 1.3056, "grad_norm": 1.0205804109573364, "learning_rate": 0.0002, "epoch": 4.763760350706283, "step": 4890}, {"loss": 1.223, "grad_norm": 0.9814772605895996, "learning_rate": 0.0002, "epoch": 4.773502191914272, "step": 4900}, {"loss": 1.3114, "grad_norm": 1.002854347229004, "learning_rate": 0.0002, "epoch": 4.78324403312226, "step": 4910}, {"loss": 1.3143, "grad_norm": 1.1609505414962769, "learning_rate": 0.0002, "epoch": 4.792985874330248, "step": 4920}, {"loss": 1.3166, "grad_norm": 0.9354982376098633, "learning_rate": 0.0002, "epoch": 4.802727715538237, "step": 4930}, {"loss": 1.2978, "grad_norm": 0.9761685729026794, "learning_rate": 0.0002, "epoch": 4.812469556746225, "step": 4940}, {"loss": 1.2709, "grad_norm": 1.0604596138000488, "learning_rate": 0.0002, "epoch": 4.822211397954213, "step": 4950}, {"loss": 1.2765, "grad_norm": 1.0902808904647827, "learning_rate": 0.0002, "epoch": 4.8319532391622015, "step": 4960}, {"loss": 1.3073, "grad_norm": 1.0174955129623413, "learning_rate": 0.0002, "epoch": 4.84169508037019, "step": 4970}, {"loss": 1.3141, "grad_norm": 1.0995253324508667, "learning_rate": 0.0002, "epoch": 4.851436921578179, "step": 4980}, {"loss": 1.3006, "grad_norm": 0.880993127822876, "learning_rate": 0.0002, "epoch": 4.8611787627861665, "step": 4990}, {"loss": 1.2547, "grad_norm": 0.9472237825393677, "learning_rate": 0.0002, "epoch": 4.870920603994155, "step": 5000}, {"loss": 1.4078, "grad_norm": 0.9504236578941345, "learning_rate": 0.0002, "epoch": 4.880662445202143, "step": 5010}, {"loss": 1.2791, "grad_norm": 1.1261742115020752, "learning_rate": 0.0002, "epoch": 4.890404286410131, "step": 5020}, {"loss": 1.3707, "grad_norm": 0.904674768447876, "learning_rate": 0.0002, "epoch": 4.90014612761812, "step": 5030}, {"loss": 1.2762, "grad_norm": 0.8828991055488586, "learning_rate": 0.0002, "epoch": 4.909887968826109, "step": 5040}, {"loss": 1.2905, "grad_norm": 1.0156532526016235, "learning_rate": 0.0002, "epoch": 4.919629810034096, "step": 5050}, {"loss": 1.3079, "grad_norm": 0.8975168466567993, "learning_rate": 0.0002, "epoch": 4.929371651242085, "step": 5060}, {"loss": 1.3322, "grad_norm": 0.9787213802337646, "learning_rate": 0.0002, "epoch": 4.939113492450073, "step": 5070}, {"loss": 1.2533, "grad_norm": 1.0801568031311035, "learning_rate": 0.0002, "epoch": 4.948855333658061, "step": 5080}, {"loss": 1.238, "grad_norm": 1.0655089616775513, "learning_rate": 0.0002, "epoch": 4.95859717486605, "step": 5090}, {"loss": 1.2449, "grad_norm": 0.8941320180892944, "learning_rate": 0.0002, "epoch": 4.968339016074038, "step": 5100}, {"loss": 1.2846, "grad_norm": 1.050621747970581, "learning_rate": 0.0002, "epoch": 4.978080857282026, "step": 5110}, {"loss": 1.3791, "grad_norm": 0.9724781513214111, "learning_rate": 0.0002, "epoch": 4.987822698490015, "step": 5120}, {"loss": 1.292, "grad_norm": 0.9850538969039917, "learning_rate": 0.0002, "epoch": 4.997564539698003, "step": 5130}, {"eval_loss": 2.0824170112609863, "eval_runtime": 55.592, "eval_samples_per_second": 9.12, "eval_steps_per_second": 1.151, "epoch": 4.9995129079396, "step": 5132}, {"loss": 1.037, "grad_norm": 1.0096189975738525, "learning_rate": 0.0002, "epoch": 5.007306380905991, "step": 5140}, {"loss": 1.0003, "grad_norm": 1.2403408288955688, "learning_rate": 0.0002, "epoch": 5.01704822211398, "step": 5150}, {"loss": 1.0129, "grad_norm": 1.1243221759796143, "learning_rate": 0.0002, "epoch": 5.026790063321968, "step": 5160}, {"loss": 0.9815, "grad_norm": 1.4745502471923828, "learning_rate": 0.0002, "epoch": 5.036531904529956, "step": 5170}, {"loss": 0.9715, "grad_norm": 1.1913198232650757, "learning_rate": 0.0002, "epoch": 5.0462737457379445, "step": 5180}, {"loss": 0.9282, "grad_norm": 1.2732855081558228, "learning_rate": 0.0002, "epoch": 5.056015586945933, "step": 5190}, {"loss": 0.9857, "grad_norm": 1.1737396717071533, "learning_rate": 0.0002, "epoch": 5.065757428153921, "step": 5200}, {"loss": 0.9754, "grad_norm": 1.4162768125534058, "learning_rate": 0.0002, "epoch": 5.075499269361909, "step": 5210}, {"loss": 1.0333, "grad_norm": 1.528274655342102, "learning_rate": 0.0002, "epoch": 5.085241110569898, "step": 5220}, {"loss": 1.0227, "grad_norm": 1.3966618776321411, "learning_rate": 0.0002, "epoch": 5.094982951777886, "step": 5230}, {"loss": 0.987, "grad_norm": 1.3427953720092773, "learning_rate": 0.0002, "epoch": 5.104724792985874, "step": 5240}, {"loss": 1.0353, "grad_norm": 1.6533905267715454, "learning_rate": 0.0002, "epoch": 5.114466634193863, "step": 5250}, {"loss": 1.0452, "grad_norm": 1.4114865064620972, "learning_rate": 0.0002, "epoch": 5.124208475401851, "step": 5260}, {"loss": 1.067, "grad_norm": 1.5460708141326904, "learning_rate": 0.0002, "epoch": 5.133950316609839, "step": 5270}, {"loss": 1.0667, "grad_norm": 1.3491919040679932, "learning_rate": 0.0002, "epoch": 5.143692157817828, "step": 5280}, {"loss": 0.9957, "grad_norm": 1.2208969593048096, "learning_rate": 0.0002, "epoch": 5.153433999025816, "step": 5290}, {"loss": 1.0362, "grad_norm": 1.1141403913497925, "learning_rate": 0.0002, "epoch": 5.163175840233804, "step": 5300}, {"loss": 0.9744, "grad_norm": 1.2938064336776733, "learning_rate": 0.0002, "epoch": 5.172917681441793, "step": 5310}, {"loss": 1.0438, "grad_norm": 1.2704918384552002, "learning_rate": 0.0002, "epoch": 5.1826595226497805, "step": 5320}, {"loss": 1.0015, "grad_norm": 1.3928544521331787, "learning_rate": 0.0002, "epoch": 5.192401363857769, "step": 5330}, {"loss": 1.025, "grad_norm": 1.1993824243545532, "learning_rate": 0.0002, "epoch": 5.202143205065758, "step": 5340}, {"loss": 1.0195, "grad_norm": 1.5913670063018799, "learning_rate": 0.0002, "epoch": 5.211885046273745, "step": 5350}, {"loss": 1.0113, "grad_norm": 1.1577855348587036, "learning_rate": 0.0002, "epoch": 5.221626887481734, "step": 5360}, {"loss": 1.0684, "grad_norm": 1.4535993337631226, "learning_rate": 0.0002, "epoch": 5.231368728689723, "step": 5370}, {"loss": 1.0255, "grad_norm": 1.5068976879119873, "learning_rate": 0.0002, "epoch": 5.24111056989771, "step": 5380}, {"loss": 1.0068, "grad_norm": 1.2365459203720093, "learning_rate": 0.0002, "epoch": 5.250852411105699, "step": 5390}, {"loss": 1.0145, "grad_norm": 1.3197922706604004, "learning_rate": 0.0002, "epoch": 5.2605942523136875, "step": 5400}, {"loss": 1.0767, "grad_norm": 1.2395117282867432, "learning_rate": 0.0002, "epoch": 5.270336093521675, "step": 5410}, {"loss": 1.0292, "grad_norm": 1.1841236352920532, "learning_rate": 0.0002, "epoch": 5.280077934729664, "step": 5420}, {"loss": 1.0233, "grad_norm": 1.218003749847412, "learning_rate": 0.0002, "epoch": 5.289819775937652, "step": 5430}, {"loss": 1.0093, "grad_norm": 1.2210947275161743, "learning_rate": 0.0002, "epoch": 5.29956161714564, "step": 5440}, {"loss": 0.9619, "grad_norm": 1.266006588935852, "learning_rate": 0.0002, "epoch": 5.309303458353629, "step": 5450}, {"loss": 1.0352, "grad_norm": 1.2598075866699219, "learning_rate": 0.0002, "epoch": 5.319045299561617, "step": 5460}, {"loss": 1.0929, "grad_norm": 1.2410019636154175, "learning_rate": 0.0002, "epoch": 5.328787140769606, "step": 5470}, {"loss": 1.058, "grad_norm": 1.249698519706726, "learning_rate": 0.0002, "epoch": 5.338528981977594, "step": 5480}, {"loss": 1.0457, "grad_norm": 1.2398173809051514, "learning_rate": 0.0002, "epoch": 5.348270823185582, "step": 5490}, {"loss": 1.0139, "grad_norm": 1.2416654825210571, "learning_rate": 0.0002, "epoch": 5.35801266439357, "step": 5500}, {"loss": 1.0609, "grad_norm": 1.398706316947937, "learning_rate": 0.0002, "epoch": 5.3677545056015585, "step": 5510}, {"loss": 1.0512, "grad_norm": 1.3049418926239014, "learning_rate": 0.0002, "epoch": 5.377496346809547, "step": 5520}, {"loss": 1.0912, "grad_norm": 1.2528893947601318, "learning_rate": 0.0002, "epoch": 5.387238188017536, "step": 5530}, {"loss": 1.0619, "grad_norm": 1.2963255643844604, "learning_rate": 0.0002, "epoch": 5.3969800292255234, "step": 5540}, {"loss": 1.0194, "grad_norm": 1.494231104850769, "learning_rate": 0.0002, "epoch": 5.406721870433512, "step": 5550}, {"loss": 1.0179, "grad_norm": 1.2760992050170898, "learning_rate": 0.0002, "epoch": 5.416463711641501, "step": 5560}, {"loss": 1.1088, "grad_norm": 1.195292592048645, "learning_rate": 0.0002, "epoch": 5.426205552849488, "step": 5570}, {"loss": 1.0859, "grad_norm": 1.6408965587615967, "learning_rate": 0.0002, "epoch": 5.435947394057477, "step": 5580}, {"loss": 1.0868, "grad_norm": 1.3092058897018433, "learning_rate": 0.0002, "epoch": 5.4456892352654656, "step": 5590}, {"loss": 1.006, "grad_norm": 1.2960586547851562, "learning_rate": 0.0002, "epoch": 5.455431076473453, "step": 5600}, {"loss": 1.0257, "grad_norm": 1.3560487031936646, "learning_rate": 0.0002, "epoch": 5.465172917681442, "step": 5610}, {"loss": 1.0314, "grad_norm": 1.1896311044692993, "learning_rate": 0.0002, "epoch": 5.4749147588894305, "step": 5620}, {"loss": 1.0435, "grad_norm": 1.3145595788955688, "learning_rate": 0.0002, "epoch": 5.484656600097418, "step": 5630}, {"loss": 1.0456, "grad_norm": 1.2207404375076294, "learning_rate": 0.0002, "epoch": 5.494398441305407, "step": 5640}, {"loss": 1.0823, "grad_norm": 1.266015887260437, "learning_rate": 0.0002, "epoch": 5.504140282513395, "step": 5650}, {"loss": 1.0696, "grad_norm": 1.2478289604187012, "learning_rate": 0.0002, "epoch": 5.513882123721383, "step": 5660}, {"loss": 1.0695, "grad_norm": 1.4851372241973877, "learning_rate": 0.0002, "epoch": 5.523623964929372, "step": 5670}, {"loss": 1.0736, "grad_norm": 1.4478679895401, "learning_rate": 0.0002, "epoch": 5.53336580613736, "step": 5680}, {"loss": 1.043, "grad_norm": 1.1079537868499756, "learning_rate": 0.0002, "epoch": 5.543107647345348, "step": 5690}, {"loss": 1.1107, "grad_norm": 1.4201879501342773, "learning_rate": 0.0002, "epoch": 5.552849488553337, "step": 5700}, {"loss": 1.0697, "grad_norm": 1.2092000246047974, "learning_rate": 0.0002, "epoch": 5.562591329761325, "step": 5710}, {"loss": 0.9868, "grad_norm": 1.4515851736068726, "learning_rate": 0.0002, "epoch": 5.572333170969313, "step": 5720}, {"loss": 1.1547, "grad_norm": 1.3260412216186523, "learning_rate": 0.0002, "epoch": 5.5820750121773015, "step": 5730}, {"loss": 1.1388, "grad_norm": 1.248191475868225, "learning_rate": 0.0002, "epoch": 5.59181685338529, "step": 5740}, {"loss": 1.0597, "grad_norm": 1.2037307024002075, "learning_rate": 0.0002, "epoch": 5.601558694593278, "step": 5750}, {"loss": 1.1425, "grad_norm": 1.341237187385559, "learning_rate": 0.0002, "epoch": 5.611300535801266, "step": 5760}, {"loss": 1.0942, "grad_norm": 1.130115270614624, "learning_rate": 0.0002, "epoch": 5.621042377009255, "step": 5770}, {"loss": 1.1029, "grad_norm": 1.3834772109985352, "learning_rate": 0.0002, "epoch": 5.630784218217243, "step": 5780}, {"loss": 1.0825, "grad_norm": 1.2586270570755005, "learning_rate": 0.0002, "epoch": 5.640526059425231, "step": 5790}, {"loss": 1.0186, "grad_norm": 1.3233023881912231, "learning_rate": 0.0002, "epoch": 5.65026790063322, "step": 5800}, {"loss": 1.0557, "grad_norm": 1.2711341381072998, "learning_rate": 0.0002, "epoch": 5.660009741841208, "step": 5810}, {"loss": 1.0897, "grad_norm": 1.3867720365524292, "learning_rate": 0.0002, "epoch": 5.669751583049196, "step": 5820}, {"loss": 1.0776, "grad_norm": 1.4783269166946411, "learning_rate": 0.0002, "epoch": 5.679493424257185, "step": 5830}, {"loss": 1.0632, "grad_norm": 1.2744768857955933, "learning_rate": 0.0002, "epoch": 5.6892352654651726, "step": 5840}, {"loss": 1.1484, "grad_norm": 1.3405882120132446, "learning_rate": 0.0002, "epoch": 5.698977106673161, "step": 5850}, {"loss": 1.0975, "grad_norm": 1.204300880432129, "learning_rate": 0.0002, "epoch": 5.70871894788115, "step": 5860}, {"loss": 1.0494, "grad_norm": 1.2954572439193726, "learning_rate": 0.0002, "epoch": 5.7184607890891375, "step": 5870}, {"loss": 1.0643, "grad_norm": 1.5478382110595703, "learning_rate": 0.0002, "epoch": 5.728202630297126, "step": 5880}, {"loss": 1.0582, "grad_norm": 1.2095842361450195, "learning_rate": 0.0002, "epoch": 5.737944471505115, "step": 5890}, {"loss": 1.1, "grad_norm": 1.0691519975662231, "learning_rate": 0.0002, "epoch": 5.747686312713103, "step": 5900}, {"loss": 1.0906, "grad_norm": 1.1920677423477173, "learning_rate": 0.0002, "epoch": 5.757428153921091, "step": 5910}, {"loss": 1.1746, "grad_norm": 1.2051277160644531, "learning_rate": 0.0002, "epoch": 5.76716999512908, "step": 5920}, {"loss": 1.1221, "grad_norm": 1.197490930557251, "learning_rate": 0.0002, "epoch": 5.776911836337067, "step": 5930}, {"loss": 1.07, "grad_norm": 1.2003998756408691, "learning_rate": 0.0002, "epoch": 5.786653677545056, "step": 5940}, {"loss": 1.0938, "grad_norm": 1.2323646545410156, "learning_rate": 0.0002, "epoch": 5.7963955187530445, "step": 5950}, {"loss": 1.1443, "grad_norm": 1.2593932151794434, "learning_rate": 0.0002, "epoch": 5.806137359961033, "step": 5960}, {"loss": 1.0829, "grad_norm": 1.1835976839065552, "learning_rate": 0.0002, "epoch": 5.815879201169021, "step": 5970}, {"loss": 1.1056, "grad_norm": 1.4770104885101318, "learning_rate": 0.0002, "epoch": 5.825621042377009, "step": 5980}, {"loss": 1.1934, "grad_norm": 1.1025809049606323, "learning_rate": 0.0002, "epoch": 5.835362883584997, "step": 5990}, {"loss": 1.1323, "grad_norm": 1.364588975906372, "learning_rate": 0.0002, "epoch": 5.845104724792986, "step": 6000}, {"loss": 1.1234, "grad_norm": 1.2340112924575806, "learning_rate": 0.0002, "epoch": 5.854846566000974, "step": 6010}, {"loss": 1.1123, "grad_norm": 1.4925711154937744, "learning_rate": 0.0002, "epoch": 5.864588407208963, "step": 6020}, {"loss": 1.12, "grad_norm": 1.3516744375228882, "learning_rate": 0.0002, "epoch": 5.874330248416951, "step": 6030}, {"loss": 1.1399, "grad_norm": 1.2058138847351074, "learning_rate": 0.0002, "epoch": 5.884072089624939, "step": 6040}, {"loss": 1.1074, "grad_norm": 1.13870108127594, "learning_rate": 0.0002, "epoch": 5.893813930832927, "step": 6050}, {"loss": 1.088, "grad_norm": 1.1587319374084473, "learning_rate": 0.0002, "epoch": 5.9035557720409155, "step": 6060}, {"loss": 1.1376, "grad_norm": 1.164481520652771, "learning_rate": 0.0002, "epoch": 5.913297613248904, "step": 6070}, {"loss": 1.1262, "grad_norm": 1.2115206718444824, "learning_rate": 0.0002, "epoch": 5.923039454456893, "step": 6080}, {"loss": 1.1345, "grad_norm": 1.3201590776443481, "learning_rate": 0.0002, "epoch": 5.93278129566488, "step": 6090}, {"loss": 1.1288, "grad_norm": 1.287380576133728, "learning_rate": 0.0002, "epoch": 5.942523136872869, "step": 6100}, {"loss": 1.1475, "grad_norm": 1.1820166110992432, "learning_rate": 0.0002, "epoch": 5.952264978080858, "step": 6110}, {"loss": 1.1112, "grad_norm": 1.2550667524337769, "learning_rate": 0.0002, "epoch": 5.962006819288845, "step": 6120}, {"loss": 1.1528, "grad_norm": 1.3547813892364502, "learning_rate": 0.0002, "epoch": 5.971748660496834, "step": 6130}, {"loss": 1.0557, "grad_norm": 1.260842204093933, "learning_rate": 0.0002, "epoch": 5.9814905017048225, "step": 6140}, {"loss": 1.1119, "grad_norm": 1.1643036603927612, "learning_rate": 0.0002, "epoch": 5.99123234291281, "step": 6150}]} +{"epoch": 6.9995129079396, "step": 7185, "epoch_duration": 1628.9270315170288, "total_accumulated_duration": 12510.417803525925, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0782, "grad_norm": 0.6537588834762573, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4644, "grad_norm": 0.5270306468009949, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2589, "grad_norm": 0.6826501488685608, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0141, "grad_norm": 0.5061377286911011, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9458, "grad_norm": 0.4300410747528076, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.983, "grad_norm": 0.5063319802284241, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9799, "grad_norm": 0.49310117959976196, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9277, "grad_norm": 0.4676004648208618, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9147, "grad_norm": 0.41647228598594666, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8894, "grad_norm": 0.40217313170433044, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9099, "grad_norm": 0.4123637080192566, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8471, "grad_norm": 0.37414297461509705, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3179326355457306, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.847, "grad_norm": 0.3548192083835602, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7919, "grad_norm": 0.3273540139198303, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8496, "grad_norm": 0.36500975489616394, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8473, "grad_norm": 0.4106619656085968, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.841, "grad_norm": 0.41361644864082336, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8879, "grad_norm": 0.3608580231666565, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7717, "grad_norm": 0.4291760325431824, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8437, "grad_norm": 0.344184011220932, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8779, "grad_norm": 0.3834705650806427, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7533, "grad_norm": 0.3738210201263428, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.824, "grad_norm": 0.4306780695915222, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8519, "grad_norm": 0.5066465139389038, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7402, "grad_norm": 0.34227681159973145, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8614, "grad_norm": 0.3346865475177765, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8502, "grad_norm": 0.3639362156391144, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.33223700523376465, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.35176315903663635, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7635, "grad_norm": 0.3581472635269165, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8262, "grad_norm": 0.35943421721458435, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8167, "grad_norm": 0.322051078081131, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8221, "grad_norm": 0.33904823660850525, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9159, "grad_norm": 0.39162731170654297, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7966, "grad_norm": 0.330624520778656, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8646, "grad_norm": 0.3793248236179352, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8038, "grad_norm": 0.3347395658493042, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9244, "grad_norm": 0.30527254939079285, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7982, "grad_norm": 0.3081390857696533, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8968, "grad_norm": 0.3742620050907135, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8095, "grad_norm": 0.4080568253993988, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8555, "grad_norm": 0.38034746050834656, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8494, "grad_norm": 0.34893402457237244, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8481, "grad_norm": 0.33285608887672424, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8466, "grad_norm": 0.4110095798969269, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7906, "grad_norm": 0.3658817410469055, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7589, "grad_norm": 0.31350770592689514, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7839, "grad_norm": 0.38827991485595703, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8224, "grad_norm": 0.3792393207550049, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8028, "grad_norm": 0.3004095256328583, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6899, "grad_norm": 0.3200063407421112, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8205, "grad_norm": 0.3206128478050232, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7725, "grad_norm": 0.30258631706237793, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7791, "grad_norm": 0.28210392594337463, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.34854066371917725, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8331, "grad_norm": 0.31689873337745667, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8311, "grad_norm": 0.31253790855407715, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7035, "grad_norm": 0.3229721188545227, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.766, "grad_norm": 0.3723772466182709, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8357, "grad_norm": 0.345798522233963, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7266, "grad_norm": 0.3440598249435425, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8307, "grad_norm": 0.3406416177749634, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8103, "grad_norm": 0.3218357264995575, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7457, "grad_norm": 0.45319172739982605, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8674, "grad_norm": 0.2787110507488251, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8426, "grad_norm": 0.3064707815647125, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.846, "grad_norm": 0.2940629720687866, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7865, "grad_norm": 0.31695225834846497, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.29589611291885376, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7828, "grad_norm": 0.3062121570110321, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8108, "grad_norm": 0.3315656781196594, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8056, "grad_norm": 0.30353930592536926, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7682, "grad_norm": 0.28360483050346375, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3362562656402588, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7607, "grad_norm": 0.40434667468070984, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8374, "grad_norm": 0.2930425703525543, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8216, "grad_norm": 0.30177003145217896, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8081, "grad_norm": 0.2784474790096283, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.35849854350090027, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7896, "grad_norm": 0.27329114079475403, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7633, "grad_norm": 0.33331671357154846, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7508, "grad_norm": 0.28727295994758606, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31391268968582153, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.793, "grad_norm": 0.3303709030151367, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7479, "grad_norm": 0.33772537112236023, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.32876333594322205, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7832, "grad_norm": 0.28444716334342957, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7077, "grad_norm": 0.3070019483566284, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7699, "grad_norm": 0.29484760761260986, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7211, "grad_norm": 0.32373034954071045, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7799, "grad_norm": 0.3229396939277649, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8226, "grad_norm": 0.33151453733444214, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8339, "grad_norm": 0.32037460803985596, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.822, "grad_norm": 0.31283533573150635, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8233, "grad_norm": 0.27984118461608887, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7755, "grad_norm": 0.316500186920166, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8032, "grad_norm": 0.33708682656288147, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8863, "grad_norm": 0.31026017665863037, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8458, "grad_norm": 0.30874672532081604, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7975, "grad_norm": 0.3257741630077362, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7936, "grad_norm": 0.2865653932094574, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}, {"eval_loss": 1.8103164434432983, "eval_runtime": 56.3917, "eval_samples_per_second": 8.991, "eval_steps_per_second": 1.135, "epoch": 0.9995129079396006, "step": 1026}, {"loss": 1.7013, "grad_norm": 0.2860608398914337, "learning_rate": 0.0002, "epoch": 1.003409644422796, "step": 1030}, {"loss": 1.7521, "grad_norm": 0.3156210780143738, "learning_rate": 0.0002, "epoch": 1.0131514856307842, "step": 1040}, {"loss": 1.6182, "grad_norm": 0.26126575469970703, "learning_rate": 0.0002, "epoch": 1.0228933268387725, "step": 1050}, {"loss": 1.7546, "grad_norm": 0.3019633889198303, "learning_rate": 0.0002, "epoch": 1.032635168046761, "step": 1060}, {"loss": 1.7096, "grad_norm": 0.534140944480896, "learning_rate": 0.0002, "epoch": 1.042377009254749, "step": 1070}, {"loss": 1.664, "grad_norm": 0.311872661113739, "learning_rate": 0.0002, "epoch": 1.0521188504627375, "step": 1080}, {"loss": 1.7056, "grad_norm": 0.3276001513004303, "learning_rate": 0.0002, "epoch": 1.0618606916707258, "step": 1090}, {"loss": 1.7943, "grad_norm": 0.35227468609809875, "learning_rate": 0.0002, "epoch": 1.071602532878714, "step": 1100}, {"loss": 1.7079, "grad_norm": 0.3597564995288849, "learning_rate": 0.0002, "epoch": 1.0813443740867024, "step": 1110}, {"loss": 1.7635, "grad_norm": 0.3547225296497345, "learning_rate": 0.0002, "epoch": 1.0910862152946907, "step": 1120}, {"loss": 1.6859, "grad_norm": 0.3399786353111267, "learning_rate": 0.0002, "epoch": 1.100828056502679, "step": 1130}, {"loss": 1.7478, "grad_norm": 0.3309086263179779, "learning_rate": 0.0002, "epoch": 1.1105698977106673, "step": 1140}, {"loss": 1.7372, "grad_norm": 0.39330706000328064, "learning_rate": 0.0002, "epoch": 1.1203117389186557, "step": 1150}, {"loss": 1.6236, "grad_norm": 0.3628021776676178, "learning_rate": 0.0002, "epoch": 1.130053580126644, "step": 1160}, {"loss": 1.8022, "grad_norm": 0.32995012402534485, "learning_rate": 0.0002, "epoch": 1.1397954213346322, "step": 1170}, {"loss": 1.636, "grad_norm": 0.36292821168899536, "learning_rate": 0.0002, "epoch": 1.1495372625426206, "step": 1180}, {"loss": 1.7156, "grad_norm": 0.3470092713832855, "learning_rate": 0.0002, "epoch": 1.159279103750609, "step": 1190}, {"loss": 1.7001, "grad_norm": 0.3496156334877014, "learning_rate": 0.0002, "epoch": 1.169020944958597, "step": 1200}, {"loss": 1.716, "grad_norm": 0.3442084789276123, "learning_rate": 0.0002, "epoch": 1.1787627861665855, "step": 1210}, {"loss": 1.7763, "grad_norm": 0.34983909130096436, "learning_rate": 0.0002, "epoch": 1.1885046273745739, "step": 1220}, {"loss": 1.6964, "grad_norm": 0.36505937576293945, "learning_rate": 0.0002, "epoch": 1.198246468582562, "step": 1230}, {"loss": 1.7382, "grad_norm": 0.31624770164489746, "learning_rate": 0.0002, "epoch": 1.2079883097905504, "step": 1240}, {"loss": 1.7196, "grad_norm": 0.3528020679950714, "learning_rate": 0.0002, "epoch": 1.2177301509985388, "step": 1250}, {"loss": 1.752, "grad_norm": 0.29294025897979736, "learning_rate": 0.0002, "epoch": 1.2274719922065271, "step": 1260}, {"loss": 1.719, "grad_norm": 0.35048434138298035, "learning_rate": 0.0002, "epoch": 1.2372138334145153, "step": 1270}, {"loss": 1.6806, "grad_norm": 0.35224461555480957, "learning_rate": 0.0002, "epoch": 1.2469556746225037, "step": 1280}, {"loss": 1.6836, "grad_norm": 0.4041554629802704, "learning_rate": 0.0002, "epoch": 1.256697515830492, "step": 1290}, {"loss": 1.6999, "grad_norm": 0.3447791039943695, "learning_rate": 0.0002, "epoch": 1.2664393570384802, "step": 1300}, {"loss": 1.7601, "grad_norm": 0.3315333425998688, "learning_rate": 0.0002, "epoch": 1.2761811982464686, "step": 1310}, {"loss": 1.7325, "grad_norm": 0.3587741255760193, "learning_rate": 0.0002, "epoch": 1.285923039454457, "step": 1320}, {"loss": 1.6707, "grad_norm": 0.3704394996166229, "learning_rate": 0.0002, "epoch": 1.2956648806624451, "step": 1330}, {"loss": 1.7374, "grad_norm": 0.38131803274154663, "learning_rate": 0.0002, "epoch": 1.3054067218704335, "step": 1340}, {"loss": 1.5956, "grad_norm": 0.36109617352485657, "learning_rate": 0.0002, "epoch": 1.3151485630784219, "step": 1350}, {"loss": 1.682, "grad_norm": 0.37283554673194885, "learning_rate": 0.0002, "epoch": 1.32489040428641, "step": 1360}, {"loss": 1.7351, "grad_norm": 0.31808891892433167, "learning_rate": 0.0002, "epoch": 1.3346322454943984, "step": 1370}, {"loss": 1.8368, "grad_norm": 0.3370385766029358, "learning_rate": 0.0002, "epoch": 1.3443740867023868, "step": 1380}, {"loss": 1.6922, "grad_norm": 0.3568558394908905, "learning_rate": 0.0002, "epoch": 1.354115927910375, "step": 1390}, {"loss": 1.7301, "grad_norm": 0.3537410497665405, "learning_rate": 0.0002, "epoch": 1.3638577691183633, "step": 1400}, {"loss": 1.6534, "grad_norm": 0.3536544144153595, "learning_rate": 0.0002, "epoch": 1.3735996103263517, "step": 1410}, {"loss": 1.6829, "grad_norm": 0.3772895038127899, "learning_rate": 0.0002, "epoch": 1.38334145153434, "step": 1420}, {"loss": 1.7344, "grad_norm": 0.38079720735549927, "learning_rate": 0.0002, "epoch": 1.3930832927423282, "step": 1430}, {"loss": 1.7121, "grad_norm": 0.3811109662055969, "learning_rate": 0.0002, "epoch": 1.4028251339503166, "step": 1440}, {"loss": 1.6424, "grad_norm": 0.38586318492889404, "learning_rate": 0.0002, "epoch": 1.412566975158305, "step": 1450}, {"loss": 1.7438, "grad_norm": 0.3405744135379791, "learning_rate": 0.0002, "epoch": 1.4223088163662934, "step": 1460}, {"loss": 1.7483, "grad_norm": 0.39527642726898193, "learning_rate": 0.0002, "epoch": 1.4320506575742815, "step": 1470}, {"loss": 1.7927, "grad_norm": 0.4494728744029999, "learning_rate": 0.0002, "epoch": 1.44179249878227, "step": 1480}, {"loss": 1.5993, "grad_norm": 0.34068453311920166, "learning_rate": 0.0002, "epoch": 1.4515343399902583, "step": 1490}, {"loss": 1.73, "grad_norm": 0.36169710755348206, "learning_rate": 0.0002, "epoch": 1.4612761811982464, "step": 1500}, {"loss": 1.7189, "grad_norm": 0.31519418954849243, "learning_rate": 0.0002, "epoch": 1.4710180224062348, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.35117292404174805, "learning_rate": 0.0002, "epoch": 1.4807598636142232, "step": 1520}, {"loss": 1.6662, "grad_norm": 0.40951141715049744, "learning_rate": 0.0002, "epoch": 1.4905017048222113, "step": 1530}, {"loss": 1.764, "grad_norm": 0.37542906403541565, "learning_rate": 0.0002, "epoch": 1.5002435460301997, "step": 1540}, {"loss": 1.6563, "grad_norm": 0.35395753383636475, "learning_rate": 0.0002, "epoch": 1.509985387238188, "step": 1550}, {"loss": 1.6517, "grad_norm": 0.35497018694877625, "learning_rate": 0.0002, "epoch": 1.5197272284461762, "step": 1560}, {"loss": 1.697, "grad_norm": 0.3693031072616577, "learning_rate": 0.0002, "epoch": 1.5294690696541646, "step": 1570}, {"loss": 1.7193, "grad_norm": 0.34013301134109497, "learning_rate": 0.0002, "epoch": 1.539210910862153, "step": 1580}, {"loss": 1.7364, "grad_norm": 0.37312784790992737, "learning_rate": 0.0002, "epoch": 1.5489527520701412, "step": 1590}, {"loss": 1.6698, "grad_norm": 0.357496440410614, "learning_rate": 0.0002, "epoch": 1.5586945932781295, "step": 1600}, {"loss": 1.7113, "grad_norm": 0.35192370414733887, "learning_rate": 0.0002, "epoch": 1.568436434486118, "step": 1610}, {"loss": 1.7505, "grad_norm": 0.34144821763038635, "learning_rate": 0.0002, "epoch": 1.578178275694106, "step": 1620}, {"loss": 1.6353, "grad_norm": 0.3320509195327759, "learning_rate": 0.0002, "epoch": 1.5879201169020944, "step": 1630}, {"loss": 1.6781, "grad_norm": 0.34178847074508667, "learning_rate": 0.0002, "epoch": 1.5976619581100828, "step": 1640}, {"loss": 1.7318, "grad_norm": 0.36567580699920654, "learning_rate": 0.0002, "epoch": 1.607403799318071, "step": 1650}, {"loss": 1.6971, "grad_norm": 0.35599812865257263, "learning_rate": 0.0002, "epoch": 1.6171456405260596, "step": 1660}, {"loss": 1.7256, "grad_norm": 0.33765384554862976, "learning_rate": 0.0002, "epoch": 1.6268874817340477, "step": 1670}, {"loss": 1.7396, "grad_norm": 0.33142679929733276, "learning_rate": 0.0002, "epoch": 1.636629322942036, "step": 1680}, {"loss": 1.7143, "grad_norm": 0.6959079504013062, "learning_rate": 0.0002, "epoch": 1.6463711641500245, "step": 1690}, {"loss": 1.7665, "grad_norm": 0.35073819756507874, "learning_rate": 0.0002, "epoch": 1.6561130053580126, "step": 1700}, {"loss": 1.7571, "grad_norm": 0.3461478352546692, "learning_rate": 0.0002, "epoch": 1.665854846566001, "step": 1710}, {"loss": 1.6608, "grad_norm": 0.3697752058506012, "learning_rate": 0.0002, "epoch": 1.6755966877739894, "step": 1720}, {"loss": 1.729, "grad_norm": 0.3755154609680176, "learning_rate": 0.0002, "epoch": 1.6853385289819776, "step": 1730}, {"loss": 1.6618, "grad_norm": 0.33977627754211426, "learning_rate": 0.0002, "epoch": 1.695080370189966, "step": 1740}, {"loss": 1.7207, "grad_norm": 0.4001041650772095, "learning_rate": 0.0002, "epoch": 1.7048222113979543, "step": 1750}, {"loss": 1.6756, "grad_norm": 0.36998286843299866, "learning_rate": 0.0002, "epoch": 1.7145640526059425, "step": 1760}, {"loss": 1.7506, "grad_norm": 0.39944565296173096, "learning_rate": 0.0002, "epoch": 1.7243058938139308, "step": 1770}, {"loss": 1.6725, "grad_norm": 0.4002859890460968, "learning_rate": 0.0002, "epoch": 1.7340477350219192, "step": 1780}, {"loss": 1.768, "grad_norm": 0.33336859941482544, "learning_rate": 0.0002, "epoch": 1.7437895762299074, "step": 1790}, {"loss": 1.7816, "grad_norm": 0.35853952169418335, "learning_rate": 0.0002, "epoch": 1.7535314174378958, "step": 1800}, {"loss": 1.7135, "grad_norm": 0.35876700282096863, "learning_rate": 0.0002, "epoch": 1.7632732586458841, "step": 1810}, {"loss": 1.7327, "grad_norm": 0.3497968912124634, "learning_rate": 0.0002, "epoch": 1.7730150998538723, "step": 1820}, {"loss": 1.7128, "grad_norm": 0.33182016015052795, "learning_rate": 0.0002, "epoch": 1.7827569410618607, "step": 1830}, {"loss": 1.7594, "grad_norm": 0.33359771966934204, "learning_rate": 0.0002, "epoch": 1.792498782269849, "step": 1840}, {"loss": 1.8611, "grad_norm": 0.38070961833000183, "learning_rate": 0.0002, "epoch": 1.8022406234778372, "step": 1850}, {"loss": 1.7576, "grad_norm": 0.34111160039901733, "learning_rate": 0.0002, "epoch": 1.8119824646858256, "step": 1860}, {"loss": 1.7533, "grad_norm": 0.4439302980899811, "learning_rate": 0.0002, "epoch": 1.821724305893814, "step": 1870}, {"loss": 1.7931, "grad_norm": 0.37065210938453674, "learning_rate": 0.0002, "epoch": 1.8314661471018021, "step": 1880}, {"loss": 1.7392, "grad_norm": 0.33630406856536865, "learning_rate": 0.0002, "epoch": 1.8412079883097907, "step": 1890}, {"loss": 1.6369, "grad_norm": 0.334553986787796, "learning_rate": 0.0002, "epoch": 1.8509498295177789, "step": 1900}, {"loss": 1.7271, "grad_norm": 0.3603808879852295, "learning_rate": 0.0002, "epoch": 1.860691670725767, "step": 1910}, {"loss": 1.6777, "grad_norm": 0.4307343363761902, "learning_rate": 0.0002, "epoch": 1.8704335119337556, "step": 1920}, {"loss": 1.75, "grad_norm": 0.455602765083313, "learning_rate": 0.0002, "epoch": 1.8801753531417438, "step": 1930}, {"loss": 1.6898, "grad_norm": 0.35242316126823425, "learning_rate": 0.0002, "epoch": 1.8899171943497322, "step": 1940}, {"loss": 1.7152, "grad_norm": 0.3589116632938385, "learning_rate": 0.0002, "epoch": 1.8996590355577205, "step": 1950}, {"loss": 1.7125, "grad_norm": 0.3540741801261902, "learning_rate": 0.0002, "epoch": 1.9094008767657087, "step": 1960}, {"loss": 1.6873, "grad_norm": 0.3547612428665161, "learning_rate": 0.0002, "epoch": 1.919142717973697, "step": 1970}, {"loss": 1.6995, "grad_norm": 0.3485773503780365, "learning_rate": 0.0002, "epoch": 1.9288845591816854, "step": 1980}, {"loss": 1.7301, "grad_norm": 0.3560304641723633, "learning_rate": 0.0002, "epoch": 1.9386264003896736, "step": 1990}, {"loss": 1.748, "grad_norm": 0.33299335837364197, "learning_rate": 0.0002, "epoch": 1.948368241597662, "step": 2000}, {"loss": 1.7397, "grad_norm": 0.35622233152389526, "learning_rate": 0.0002, "epoch": 1.9581100828056504, "step": 2010}, {"loss": 1.7201, "grad_norm": 0.3681301474571228, "learning_rate": 0.0002, "epoch": 1.9678519240136385, "step": 2020}, {"loss": 1.717, "grad_norm": 0.36158084869384766, "learning_rate": 0.0002, "epoch": 1.9775937652216269, "step": 2030}, {"loss": 1.6332, "grad_norm": 0.32560569047927856, "learning_rate": 0.0002, "epoch": 1.9873356064296153, "step": 2040}, {"loss": 1.6958, "grad_norm": 0.37404149770736694, "learning_rate": 0.0002, "epoch": 1.9970774476376034, "step": 2050}, {"eval_loss": 1.8119343519210815, "eval_runtime": 96.0045, "eval_samples_per_second": 5.281, "eval_steps_per_second": 0.667, "epoch": 2.0, "step": 2053}, {"loss": 1.657, "grad_norm": 0.374188631772995, "learning_rate": 0.0002, "epoch": 2.006819288845592, "step": 2060}, {"loss": 1.5655, "grad_norm": 0.421764075756073, "learning_rate": 0.0002, "epoch": 2.01656113005358, "step": 2070}, {"loss": 1.548, "grad_norm": 0.43841829895973206, "learning_rate": 0.0002, "epoch": 2.0263029712615683, "step": 2080}, {"loss": 1.6326, "grad_norm": 0.42298218607902527, "learning_rate": 0.0002, "epoch": 2.036044812469557, "step": 2090}, {"loss": 1.5883, "grad_norm": 0.43669602274894714, "learning_rate": 0.0002, "epoch": 2.045786653677545, "step": 2100}, {"loss": 1.6143, "grad_norm": 0.4080469012260437, "learning_rate": 0.0002, "epoch": 2.0555284948855332, "step": 2110}, {"loss": 1.5591, "grad_norm": 0.483192503452301, "learning_rate": 0.0002, "epoch": 2.065270336093522, "step": 2120}, {"loss": 1.6492, "grad_norm": 0.44427400827407837, "learning_rate": 0.0002, "epoch": 2.07501217730151, "step": 2130}, {"loss": 1.5845, "grad_norm": 0.48835131525993347, "learning_rate": 0.0002, "epoch": 2.084754018509498, "step": 2140}, {"loss": 1.5617, "grad_norm": 0.42733684182167053, "learning_rate": 0.0002, "epoch": 2.0944958597174868, "step": 2150}, {"loss": 1.5562, "grad_norm": 0.4258694648742676, "learning_rate": 0.0002, "epoch": 2.104237700925475, "step": 2160}, {"loss": 1.517, "grad_norm": 0.5164985656738281, "learning_rate": 0.0002, "epoch": 2.113979542133463, "step": 2170}, {"loss": 1.6393, "grad_norm": 0.4279228150844574, "learning_rate": 0.0002, "epoch": 2.1237213833414517, "step": 2180}, {"loss": 1.5895, "grad_norm": 0.48209506273269653, "learning_rate": 0.0002, "epoch": 2.13346322454944, "step": 2190}, {"loss": 1.6142, "grad_norm": 0.4071785509586334, "learning_rate": 0.0002, "epoch": 2.143205065757428, "step": 2200}, {"loss": 1.5469, "grad_norm": 0.4629398584365845, "learning_rate": 0.0002, "epoch": 2.1529469069654166, "step": 2210}, {"loss": 1.56, "grad_norm": 0.44390997290611267, "learning_rate": 0.0002, "epoch": 2.1626887481734047, "step": 2220}, {"loss": 1.5395, "grad_norm": 0.46886971592903137, "learning_rate": 0.0002, "epoch": 2.172430589381393, "step": 2230}, {"loss": 1.6108, "grad_norm": 0.43745434284210205, "learning_rate": 0.0002, "epoch": 2.1821724305893815, "step": 2240}, {"loss": 1.5416, "grad_norm": 0.42737245559692383, "learning_rate": 0.0002, "epoch": 2.1919142717973696, "step": 2250}, {"loss": 1.627, "grad_norm": 0.5028428435325623, "learning_rate": 0.0002, "epoch": 2.201656113005358, "step": 2260}, {"loss": 1.6148, "grad_norm": 0.48987212777137756, "learning_rate": 0.0002, "epoch": 2.2113979542133464, "step": 2270}, {"loss": 1.6107, "grad_norm": 0.48186370730400085, "learning_rate": 0.0002, "epoch": 2.2211397954213346, "step": 2280}, {"loss": 1.6657, "grad_norm": 0.4417429566383362, "learning_rate": 0.0002, "epoch": 2.2308816366293227, "step": 2290}, {"loss": 1.595, "grad_norm": 0.4757710099220276, "learning_rate": 0.0002, "epoch": 2.2406234778373113, "step": 2300}, {"loss": 1.591, "grad_norm": 0.44449448585510254, "learning_rate": 0.0002, "epoch": 2.2503653190452995, "step": 2310}, {"loss": 1.5742, "grad_norm": 0.5070863962173462, "learning_rate": 0.0002, "epoch": 2.260107160253288, "step": 2320}, {"loss": 1.5831, "grad_norm": 0.4967133700847626, "learning_rate": 0.0002, "epoch": 2.269849001461276, "step": 2330}, {"loss": 1.5857, "grad_norm": 0.5110220909118652, "learning_rate": 0.0002, "epoch": 2.2795908426692644, "step": 2340}, {"loss": 1.6266, "grad_norm": 0.47984135150909424, "learning_rate": 0.0002, "epoch": 2.289332683877253, "step": 2350}, {"loss": 1.5927, "grad_norm": 0.5005794763565063, "learning_rate": 0.0002, "epoch": 2.299074525085241, "step": 2360}, {"loss": 1.6131, "grad_norm": 0.4991425573825836, "learning_rate": 0.0002, "epoch": 2.3088163662932293, "step": 2370}, {"loss": 1.5386, "grad_norm": 0.4948616623878479, "learning_rate": 0.0002, "epoch": 2.318558207501218, "step": 2380}, {"loss": 1.5769, "grad_norm": 0.4533160328865051, "learning_rate": 0.0002, "epoch": 2.328300048709206, "step": 2390}, {"loss": 1.5856, "grad_norm": 0.5871071219444275, "learning_rate": 0.0002, "epoch": 2.338041889917194, "step": 2400}, {"loss": 1.591, "grad_norm": 0.5048075914382935, "learning_rate": 0.0002, "epoch": 2.347783731125183, "step": 2410}, {"loss": 1.6165, "grad_norm": 0.4973750412464142, "learning_rate": 0.0002, "epoch": 2.357525572333171, "step": 2420}, {"loss": 1.589, "grad_norm": 0.48294538259506226, "learning_rate": 0.0002, "epoch": 2.367267413541159, "step": 2430}, {"loss": 1.5782, "grad_norm": 0.7180454134941101, "learning_rate": 0.0002, "epoch": 2.3770092547491477, "step": 2440}, {"loss": 1.56, "grad_norm": 0.4627632796764374, "learning_rate": 0.0002, "epoch": 2.386751095957136, "step": 2450}, {"loss": 1.586, "grad_norm": 0.4834378957748413, "learning_rate": 0.0002, "epoch": 2.396492937165124, "step": 2460}, {"loss": 1.6145, "grad_norm": 0.5173670649528503, "learning_rate": 0.0002, "epoch": 2.4062347783731126, "step": 2470}, {"loss": 1.5464, "grad_norm": 0.49652737379074097, "learning_rate": 0.0002, "epoch": 2.4159766195811008, "step": 2480}, {"loss": 1.6977, "grad_norm": 0.47052669525146484, "learning_rate": 0.0002, "epoch": 2.4257184607890894, "step": 2490}, {"loss": 1.6215, "grad_norm": 0.5188006162643433, "learning_rate": 0.0002, "epoch": 2.4354603019970775, "step": 2500}, {"loss": 1.5512, "grad_norm": 0.5010119676589966, "learning_rate": 0.0002, "epoch": 2.4452021432050657, "step": 2510}, {"loss": 1.6403, "grad_norm": 0.4765235483646393, "learning_rate": 0.0002, "epoch": 2.4549439844130543, "step": 2520}, {"loss": 1.5907, "grad_norm": 0.5292699337005615, "learning_rate": 0.0002, "epoch": 2.4646858256210424, "step": 2530}, {"loss": 1.5866, "grad_norm": 0.48555099964141846, "learning_rate": 0.0002, "epoch": 2.4744276668290306, "step": 2540}, {"loss": 1.5361, "grad_norm": 0.4764043092727661, "learning_rate": 0.0002, "epoch": 2.484169508037019, "step": 2550}, {"loss": 1.6545, "grad_norm": 0.47839659452438354, "learning_rate": 0.0002, "epoch": 2.4939113492450073, "step": 2560}, {"loss": 1.6269, "grad_norm": 0.4514436721801758, "learning_rate": 0.0002, "epoch": 2.5036531904529955, "step": 2570}, {"loss": 1.6389, "grad_norm": 0.5681955218315125, "learning_rate": 0.0002, "epoch": 2.513395031660984, "step": 2580}, {"loss": 1.5976, "grad_norm": 0.49655985832214355, "learning_rate": 0.0002, "epoch": 2.5231368728689723, "step": 2590}, {"loss": 1.6052, "grad_norm": 0.5077657103538513, "learning_rate": 0.0002, "epoch": 2.5328787140769604, "step": 2600}, {"loss": 1.5658, "grad_norm": 0.5643279552459717, "learning_rate": 0.0002, "epoch": 2.542620555284949, "step": 2610}, {"loss": 1.5455, "grad_norm": 0.4715031087398529, "learning_rate": 0.0002, "epoch": 2.552362396492937, "step": 2620}, {"loss": 1.5907, "grad_norm": 0.528400719165802, "learning_rate": 0.0002, "epoch": 2.5621042377009253, "step": 2630}, {"loss": 1.5452, "grad_norm": 0.49469611048698425, "learning_rate": 0.0002, "epoch": 2.571846078908914, "step": 2640}, {"loss": 1.5904, "grad_norm": 0.4567806124687195, "learning_rate": 0.0002, "epoch": 2.581587920116902, "step": 2650}, {"loss": 1.6242, "grad_norm": 0.5357107520103455, "learning_rate": 0.0002, "epoch": 2.5913297613248902, "step": 2660}, {"loss": 1.6078, "grad_norm": 0.46977677941322327, "learning_rate": 0.0002, "epoch": 2.601071602532879, "step": 2670}, {"loss": 1.6994, "grad_norm": 0.6626771092414856, "learning_rate": 0.0002, "epoch": 2.610813443740867, "step": 2680}, {"loss": 1.5888, "grad_norm": 0.4587472081184387, "learning_rate": 0.0002, "epoch": 2.620555284948855, "step": 2690}, {"loss": 1.6002, "grad_norm": 0.4816797077655792, "learning_rate": 0.0002, "epoch": 2.6302971261568437, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.4856809675693512, "learning_rate": 0.0002, "epoch": 2.640038967364832, "step": 2710}, {"loss": 1.563, "grad_norm": 0.46010780334472656, "learning_rate": 0.0002, "epoch": 2.64978080857282, "step": 2720}, {"loss": 1.5374, "grad_norm": 0.4637954533100128, "learning_rate": 0.0002, "epoch": 2.6595226497808087, "step": 2730}, {"loss": 1.6493, "grad_norm": 0.5954997539520264, "learning_rate": 0.0002, "epoch": 2.669264490988797, "step": 2740}, {"loss": 1.5795, "grad_norm": 0.5071861743927002, "learning_rate": 0.0002, "epoch": 2.679006332196785, "step": 2750}, {"loss": 1.573, "grad_norm": 0.5415477156639099, "learning_rate": 0.0002, "epoch": 2.6887481734047736, "step": 2760}, {"loss": 1.5476, "grad_norm": 0.5618549585342407, "learning_rate": 0.0002, "epoch": 2.6984900146127617, "step": 2770}, {"loss": 1.608, "grad_norm": 0.49338817596435547, "learning_rate": 0.0002, "epoch": 2.70823185582075, "step": 2780}, {"loss": 1.6529, "grad_norm": 0.5149586796760559, "learning_rate": 0.0002, "epoch": 2.7179736970287385, "step": 2790}, {"loss": 1.6279, "grad_norm": 0.6247242093086243, "learning_rate": 0.0002, "epoch": 2.7277155382367266, "step": 2800}, {"loss": 1.4655, "grad_norm": 0.4749542474746704, "learning_rate": 0.0002, "epoch": 2.737457379444715, "step": 2810}, {"loss": 1.5984, "grad_norm": 0.4979191720485687, "learning_rate": 0.0002, "epoch": 2.7471992206527034, "step": 2820}, {"loss": 1.6377, "grad_norm": 0.4885074198246002, "learning_rate": 0.0002, "epoch": 2.7569410618606915, "step": 2830}, {"loss": 1.6529, "grad_norm": 0.5047747492790222, "learning_rate": 0.0002, "epoch": 2.76668290306868, "step": 2840}, {"loss": 1.6574, "grad_norm": 0.5280140042304993, "learning_rate": 0.0002, "epoch": 2.7764247442766683, "step": 2850}, {"loss": 1.5639, "grad_norm": 0.477668434381485, "learning_rate": 0.0002, "epoch": 2.7861665854846565, "step": 2860}, {"loss": 1.5923, "grad_norm": 0.4816327393054962, "learning_rate": 0.0002, "epoch": 2.795908426692645, "step": 2870}, {"loss": 1.6377, "grad_norm": 0.523259162902832, "learning_rate": 0.0002, "epoch": 2.805650267900633, "step": 2880}, {"loss": 1.5779, "grad_norm": 0.5045270919799805, "learning_rate": 0.0002, "epoch": 2.8153921091086214, "step": 2890}, {"loss": 1.5915, "grad_norm": 0.47986042499542236, "learning_rate": 0.0002, "epoch": 2.82513395031661, "step": 2900}, {"loss": 1.5997, "grad_norm": 0.4858797490596771, "learning_rate": 0.0002, "epoch": 2.834875791524598, "step": 2910}, {"loss": 1.5664, "grad_norm": 0.5261512398719788, "learning_rate": 0.0002, "epoch": 2.8446176327325867, "step": 2920}, {"loss": 1.5775, "grad_norm": 0.630550742149353, "learning_rate": 0.0002, "epoch": 2.854359473940575, "step": 2930}, {"loss": 1.5889, "grad_norm": 0.49119752645492554, "learning_rate": 0.0002, "epoch": 2.864101315148563, "step": 2940}, {"loss": 1.6033, "grad_norm": 0.4779070317745209, "learning_rate": 0.0002, "epoch": 2.8738431563565516, "step": 2950}, {"loss": 1.6353, "grad_norm": 0.5059782266616821, "learning_rate": 0.0002, "epoch": 2.88358499756454, "step": 2960}, {"loss": 1.6403, "grad_norm": 0.5466655492782593, "learning_rate": 0.0002, "epoch": 2.893326838772528, "step": 2970}, {"loss": 1.619, "grad_norm": 0.4865640103816986, "learning_rate": 0.0002, "epoch": 2.9030686799805165, "step": 2980}, {"loss": 1.5712, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.9128105211885047, "step": 2990}, {"loss": 1.6216, "grad_norm": 0.9112305641174316, "learning_rate": 0.0002, "epoch": 2.922552362396493, "step": 3000}, {"loss": 1.6493, "grad_norm": 0.3938814103603363, "learning_rate": 0.0002, "epoch": 2.9322942036044815, "step": 3010}, {"loss": 1.5446, "grad_norm": 0.5500800609588623, "learning_rate": 0.0002, "epoch": 2.9420360448124696, "step": 3020}, {"loss": 1.5626, "grad_norm": 0.5346390604972839, "learning_rate": 0.0002, "epoch": 2.9517778860204578, "step": 3030}, {"loss": 1.5783, "grad_norm": 0.5245014429092407, "learning_rate": 0.0002, "epoch": 2.9615197272284464, "step": 3040}, {"loss": 1.5549, "grad_norm": 0.4906884431838989, "learning_rate": 0.0002, "epoch": 2.9712615684364345, "step": 3050}, {"loss": 1.5796, "grad_norm": 0.47086769342422485, "learning_rate": 0.0002, "epoch": 2.9810034096444227, "step": 3060}, {"loss": 1.5917, "grad_norm": 0.5290229320526123, "learning_rate": 0.0002, "epoch": 2.9907452508524113, "step": 3070}, {"eval_loss": 1.8463934659957886, "eval_runtime": 56.2401, "eval_samples_per_second": 9.015, "eval_steps_per_second": 1.138, "epoch": 2.9995129079396006, "step": 3079}, {"loss": 1.5744, "grad_norm": 0.49992576241493225, "learning_rate": 0.0002, "epoch": 3.0004870920603994, "step": 3080}, {"loss": 1.4125, "grad_norm": 0.8242783546447754, "learning_rate": 0.0002, "epoch": 3.0102289332683876, "step": 3090}, {"loss": 1.394, "grad_norm": 0.6330569386482239, "learning_rate": 0.0002, "epoch": 3.019970774476376, "step": 3100}, {"loss": 1.4942, "grad_norm": 0.566097617149353, "learning_rate": 0.0002, "epoch": 3.0297126156843643, "step": 3110}, {"loss": 1.4365, "grad_norm": 0.6337586045265198, "learning_rate": 0.0002, "epoch": 3.0394544568923525, "step": 3120}, {"loss": 1.3916, "grad_norm": 0.7339403033256531, "learning_rate": 0.0002, "epoch": 3.049196298100341, "step": 3130}, {"loss": 1.4617, "grad_norm": 0.7187346816062927, "learning_rate": 0.0002, "epoch": 3.0589381393083293, "step": 3140}, {"loss": 1.3453, "grad_norm": 0.7116255760192871, "learning_rate": 0.0002, "epoch": 3.0686799805163174, "step": 3150}, {"loss": 1.4452, "grad_norm": 0.6493807435035706, "learning_rate": 0.0002, "epoch": 3.078421821724306, "step": 3160}, {"loss": 1.351, "grad_norm": 0.6777266263961792, "learning_rate": 0.0002, "epoch": 3.088163662932294, "step": 3170}, {"loss": 1.4362, "grad_norm": 0.6342006325721741, "learning_rate": 0.0002, "epoch": 3.0979055041402823, "step": 3180}, {"loss": 1.4748, "grad_norm": 0.6608964204788208, "learning_rate": 0.0002, "epoch": 3.107647345348271, "step": 3190}, {"loss": 1.375, "grad_norm": 0.7230247259140015, "learning_rate": 0.0002, "epoch": 3.117389186556259, "step": 3200}, {"loss": 1.4049, "grad_norm": 0.650368332862854, "learning_rate": 0.0002, "epoch": 3.1271310277642472, "step": 3210}, {"loss": 1.409, "grad_norm": 0.7319342494010925, "learning_rate": 0.0002, "epoch": 3.136872868972236, "step": 3220}, {"loss": 1.3872, "grad_norm": 0.7159963846206665, "learning_rate": 0.0002, "epoch": 3.146614710180224, "step": 3230}, {"loss": 1.5076, "grad_norm": 0.8905230164527893, "learning_rate": 0.0002, "epoch": 3.156356551388212, "step": 3240}, {"loss": 1.3161, "grad_norm": 0.6920804381370544, "learning_rate": 0.0002, "epoch": 3.1660983925962007, "step": 3250}, {"loss": 1.3786, "grad_norm": 0.6782063841819763, "learning_rate": 0.0002, "epoch": 3.175840233804189, "step": 3260}, {"loss": 1.5153, "grad_norm": 0.735325276851654, "learning_rate": 0.0002, "epoch": 3.1855820750121775, "step": 3270}, {"loss": 1.4027, "grad_norm": 0.6657978296279907, "learning_rate": 0.0002, "epoch": 3.1953239162201656, "step": 3280}, {"loss": 1.3456, "grad_norm": 0.771315336227417, "learning_rate": 0.0002, "epoch": 3.205065757428154, "step": 3290}, {"loss": 1.3236, "grad_norm": 0.6492983102798462, "learning_rate": 0.0002, "epoch": 3.2148075986361424, "step": 3300}, {"loss": 1.4125, "grad_norm": 0.7513770461082458, "learning_rate": 0.0002, "epoch": 3.2245494398441306, "step": 3310}, {"loss": 1.4032, "grad_norm": 0.7091423869132996, "learning_rate": 0.0002, "epoch": 3.2342912810521187, "step": 3320}, {"loss": 1.4585, "grad_norm": 0.6663975119590759, "learning_rate": 0.0002, "epoch": 3.2440331222601073, "step": 3330}, {"loss": 1.3968, "grad_norm": 0.6813122034072876, "learning_rate": 0.0002, "epoch": 3.2537749634680955, "step": 3340}, {"loss": 1.3681, "grad_norm": 0.6602569818496704, "learning_rate": 0.0002, "epoch": 3.2635168046760836, "step": 3350}, {"loss": 1.4533, "grad_norm": 0.718270480632782, "learning_rate": 0.0002, "epoch": 3.2732586458840722, "step": 3360}, {"loss": 1.4076, "grad_norm": 0.6884173154830933, "learning_rate": 0.0002, "epoch": 3.2830004870920604, "step": 3370}, {"loss": 1.4144, "grad_norm": 0.7039775848388672, "learning_rate": 0.0002, "epoch": 3.2927423283000485, "step": 3380}, {"loss": 1.5077, "grad_norm": 0.7444299459457397, "learning_rate": 0.0002, "epoch": 3.302484169508037, "step": 3390}, {"loss": 1.4255, "grad_norm": 0.7187064290046692, "learning_rate": 0.0002, "epoch": 3.3122260107160253, "step": 3400}, {"loss": 1.3684, "grad_norm": 0.599396288394928, "learning_rate": 0.0002, "epoch": 3.3219678519240134, "step": 3410}, {"loss": 1.4819, "grad_norm": 0.7670390009880066, "learning_rate": 0.0002, "epoch": 3.331709693132002, "step": 3420}, {"loss": 1.4411, "grad_norm": 0.6654478311538696, "learning_rate": 0.0002, "epoch": 3.34145153433999, "step": 3430}, {"loss": 1.4257, "grad_norm": 0.6644385457038879, "learning_rate": 0.0002, "epoch": 3.351193375547979, "step": 3440}, {"loss": 1.4508, "grad_norm": 0.6974098086357117, "learning_rate": 0.0002, "epoch": 3.360935216755967, "step": 3450}, {"loss": 1.3807, "grad_norm": 0.7350399494171143, "learning_rate": 0.0002, "epoch": 3.370677057963955, "step": 3460}, {"loss": 1.4176, "grad_norm": 0.714721143245697, "learning_rate": 0.0002, "epoch": 3.3804188991719437, "step": 3470}, {"loss": 1.4325, "grad_norm": 0.7006027698516846, "learning_rate": 0.0002, "epoch": 3.390160740379932, "step": 3480}, {"loss": 1.4888, "grad_norm": 0.6767925024032593, "learning_rate": 0.0002, "epoch": 3.39990258158792, "step": 3490}, {"loss": 1.4116, "grad_norm": 0.6721355319023132, "learning_rate": 0.0002, "epoch": 3.4096444227959086, "step": 3500}, {"loss": 1.443, "grad_norm": 0.6845725178718567, "learning_rate": 0.0002, "epoch": 3.419386264003897, "step": 3510}, {"loss": 1.4832, "grad_norm": 0.6882196664810181, "learning_rate": 0.0002, "epoch": 3.429128105211885, "step": 3520}, {"loss": 1.4962, "grad_norm": 0.7663240432739258, "learning_rate": 0.0002, "epoch": 3.4388699464198735, "step": 3530}, {"loss": 1.4644, "grad_norm": 0.6304219365119934, "learning_rate": 0.0002, "epoch": 3.4486117876278617, "step": 3540}, {"loss": 1.4918, "grad_norm": 0.668678879737854, "learning_rate": 0.0002, "epoch": 3.45835362883585, "step": 3550}, {"loss": 1.4874, "grad_norm": 0.7526912093162537, "learning_rate": 0.0002, "epoch": 3.4680954700438384, "step": 3560}, {"loss": 1.4249, "grad_norm": 1.089495301246643, "learning_rate": 0.0002, "epoch": 3.4778373112518266, "step": 3570}, {"loss": 1.3871, "grad_norm": 0.7282902002334595, "learning_rate": 0.0002, "epoch": 3.4875791524598148, "step": 3580}, {"loss": 1.5077, "grad_norm": 0.6540156602859497, "learning_rate": 0.0002, "epoch": 3.4973209936678034, "step": 3590}, {"loss": 1.4367, "grad_norm": 0.6449568867683411, "learning_rate": 0.0002, "epoch": 3.5070628348757915, "step": 3600}, {"loss": 1.4532, "grad_norm": 0.7262216210365295, "learning_rate": 0.0002, "epoch": 3.5168046760837797, "step": 3610}, {"loss": 1.4374, "grad_norm": 0.6048615574836731, "learning_rate": 0.0002, "epoch": 3.5265465172917683, "step": 3620}, {"loss": 1.3877, "grad_norm": 0.6780537366867065, "learning_rate": 0.0002, "epoch": 3.5362883584997564, "step": 3630}, {"loss": 1.422, "grad_norm": 0.6851925253868103, "learning_rate": 0.0002, "epoch": 3.5460301997077446, "step": 3640}, {"loss": 1.3425, "grad_norm": 0.6530634164810181, "learning_rate": 0.0002, "epoch": 3.555772040915733, "step": 3650}, {"loss": 1.4879, "grad_norm": 0.7193992733955383, "learning_rate": 0.0002, "epoch": 3.5655138821237213, "step": 3660}, {"loss": 1.4555, "grad_norm": 0.767496645450592, "learning_rate": 0.0002, "epoch": 3.5752557233317095, "step": 3670}, {"loss": 1.4824, "grad_norm": 0.6912919282913208, "learning_rate": 0.0002, "epoch": 3.584997564539698, "step": 3680}, {"loss": 1.4497, "grad_norm": 0.7383436560630798, "learning_rate": 0.0002, "epoch": 3.5947394057476862, "step": 3690}, {"loss": 1.4822, "grad_norm": 0.6746662855148315, "learning_rate": 0.0002, "epoch": 3.6044812469556744, "step": 3700}, {"loss": 1.4904, "grad_norm": 0.6885138750076294, "learning_rate": 0.0002, "epoch": 3.614223088163663, "step": 3710}, {"loss": 1.4044, "grad_norm": 0.6694392561912537, "learning_rate": 0.0002, "epoch": 3.623964929371651, "step": 3720}, {"loss": 1.3719, "grad_norm": 0.812358021736145, "learning_rate": 0.0002, "epoch": 3.6337067705796393, "step": 3730}, {"loss": 1.4603, "grad_norm": 0.7267130017280579, "learning_rate": 0.0002, "epoch": 3.643448611787628, "step": 3740}, {"loss": 1.4574, "grad_norm": 0.6958749294281006, "learning_rate": 0.0002, "epoch": 3.653190452995616, "step": 3750}, {"loss": 1.4346, "grad_norm": 0.6805673241615295, "learning_rate": 0.0002, "epoch": 3.6629322942036042, "step": 3760}, {"loss": 1.4338, "grad_norm": 0.7184410095214844, "learning_rate": 0.0002, "epoch": 3.672674135411593, "step": 3770}, {"loss": 1.3935, "grad_norm": 0.7716330289840698, "learning_rate": 0.0002, "epoch": 3.682415976619581, "step": 3780}, {"loss": 1.384, "grad_norm": 0.6675831079483032, "learning_rate": 0.0002, "epoch": 3.6921578178275696, "step": 3790}, {"loss": 1.401, "grad_norm": 0.6480095386505127, "learning_rate": 0.0002, "epoch": 3.7018996590355577, "step": 3800}, {"loss": 1.5303, "grad_norm": 0.6559418439865112, "learning_rate": 0.0002, "epoch": 3.711641500243546, "step": 3810}, {"loss": 1.4341, "grad_norm": 0.6596545577049255, "learning_rate": 0.0002, "epoch": 3.7213833414515345, "step": 3820}, {"loss": 1.4508, "grad_norm": 0.7172950506210327, "learning_rate": 0.0002, "epoch": 3.7311251826595226, "step": 3830}, {"loss": 1.446, "grad_norm": 0.796148419380188, "learning_rate": 0.0002, "epoch": 3.740867023867511, "step": 3840}, {"loss": 1.4992, "grad_norm": 0.6600322723388672, "learning_rate": 0.0002, "epoch": 3.7506088650754994, "step": 3850}, {"loss": 1.4201, "grad_norm": 0.6776387691497803, "learning_rate": 0.0002, "epoch": 3.7603507062834876, "step": 3860}, {"loss": 1.3893, "grad_norm": 0.7768304347991943, "learning_rate": 0.0002, "epoch": 3.770092547491476, "step": 3870}, {"loss": 1.4886, "grad_norm": 1.0579794645309448, "learning_rate": 0.0002, "epoch": 3.7798343886994643, "step": 3880}, {"loss": 1.4556, "grad_norm": 0.6757252812385559, "learning_rate": 0.0002, "epoch": 3.7895762299074525, "step": 3890}, {"loss": 1.4647, "grad_norm": 0.6706996560096741, "learning_rate": 0.0002, "epoch": 3.799318071115441, "step": 3900}, {"loss": 1.4104, "grad_norm": 0.7026948928833008, "learning_rate": 0.0002, "epoch": 3.809059912323429, "step": 3910}, {"loss": 1.5487, "grad_norm": 0.6437768340110779, "learning_rate": 0.0002, "epoch": 3.8188017535314174, "step": 3920}, {"loss": 1.4678, "grad_norm": 0.7015706300735474, "learning_rate": 0.0002, "epoch": 3.828543594739406, "step": 3930}, {"loss": 1.4891, "grad_norm": 0.7049482464790344, "learning_rate": 0.0002, "epoch": 3.838285435947394, "step": 3940}, {"loss": 1.4208, "grad_norm": 0.6533724665641785, "learning_rate": 0.0002, "epoch": 3.8480272771553823, "step": 3950}, {"loss": 1.4435, "grad_norm": 0.7312499284744263, "learning_rate": 0.0002, "epoch": 3.857769118363371, "step": 3960}, {"loss": 1.3886, "grad_norm": 0.6858801245689392, "learning_rate": 0.0002, "epoch": 3.867510959571359, "step": 3970}, {"loss": 1.4423, "grad_norm": 0.770423173904419, "learning_rate": 0.0002, "epoch": 3.877252800779347, "step": 3980}, {"loss": 1.5029, "grad_norm": 0.6987539529800415, "learning_rate": 0.0002, "epoch": 3.886994641987336, "step": 3990}, {"loss": 1.4791, "grad_norm": 0.7072722315788269, "learning_rate": 0.0002, "epoch": 3.896736483195324, "step": 4000}, {"loss": 1.528, "grad_norm": 0.6492931842803955, "learning_rate": 0.0002, "epoch": 3.906478324403312, "step": 4010}, {"loss": 1.3824, "grad_norm": 0.7716232538223267, "learning_rate": 0.0002, "epoch": 3.9162201656113007, "step": 4020}, {"loss": 1.4758, "grad_norm": 0.722949743270874, "learning_rate": 0.0002, "epoch": 3.925962006819289, "step": 4030}, {"loss": 1.3914, "grad_norm": 0.7434365749359131, "learning_rate": 0.0002, "epoch": 3.935703848027277, "step": 4040}, {"loss": 1.4763, "grad_norm": 0.6691509485244751, "learning_rate": 0.0002, "epoch": 3.9454456892352656, "step": 4050}, {"loss": 1.4555, "grad_norm": 0.6850284337997437, "learning_rate": 0.0002, "epoch": 3.9551875304432538, "step": 4060}, {"loss": 1.5275, "grad_norm": 0.6954452991485596, "learning_rate": 0.0002, "epoch": 3.964929371651242, "step": 4070}, {"loss": 1.417, "grad_norm": 0.9316364526748657, "learning_rate": 0.0002, "epoch": 3.9746712128592305, "step": 4080}, {"loss": 1.4532, "grad_norm": 0.6908289194107056, "learning_rate": 0.0002, "epoch": 3.9844130540672187, "step": 4090}, {"loss": 1.4404, "grad_norm": 0.666782021522522, "learning_rate": 0.0002, "epoch": 3.994154895275207, "step": 4100}, {"eval_loss": 1.9233275651931763, "eval_runtime": 55.9536, "eval_samples_per_second": 9.061, "eval_steps_per_second": 1.144, "epoch": 4.0, "step": 4106}, {"loss": 1.3489, "grad_norm": 0.7726166248321533, "learning_rate": 0.0002, "epoch": 4.003896736483195, "step": 4110}, {"loss": 1.1415, "grad_norm": 1.1338967084884644, "learning_rate": 0.0002, "epoch": 4.013638577691184, "step": 4120}, {"loss": 1.2212, "grad_norm": 0.9530029296875, "learning_rate": 0.0002, "epoch": 4.023380418899172, "step": 4130}, {"loss": 1.2002, "grad_norm": 1.1058554649353027, "learning_rate": 0.0002, "epoch": 4.03312226010716, "step": 4140}, {"loss": 1.2381, "grad_norm": 0.8765049576759338, "learning_rate": 0.0002, "epoch": 4.042864101315149, "step": 4150}, {"loss": 1.2708, "grad_norm": 1.1774667501449585, "learning_rate": 0.0002, "epoch": 4.052605942523137, "step": 4160}, {"loss": 1.2116, "grad_norm": 0.9301433563232422, "learning_rate": 0.0002, "epoch": 4.062347783731125, "step": 4170}, {"loss": 1.1807, "grad_norm": 1.0196778774261475, "learning_rate": 0.0002, "epoch": 4.072089624939114, "step": 4180}, {"loss": 1.2602, "grad_norm": 1.1380577087402344, "learning_rate": 0.0002, "epoch": 4.081831466147102, "step": 4190}, {"loss": 1.2521, "grad_norm": 0.9121319651603699, "learning_rate": 0.0002, "epoch": 4.09157330735509, "step": 4200}, {"loss": 1.1747, "grad_norm": 0.9495378732681274, "learning_rate": 0.0002, "epoch": 4.101315148563079, "step": 4210}, {"loss": 1.1829, "grad_norm": 0.8058680295944214, "learning_rate": 0.0002, "epoch": 4.1110569897710665, "step": 4220}, {"loss": 1.1732, "grad_norm": 1.000887393951416, "learning_rate": 0.0002, "epoch": 4.120798830979055, "step": 4230}, {"loss": 1.1947, "grad_norm": 0.9529102444648743, "learning_rate": 0.0002, "epoch": 4.130540672187044, "step": 4240}, {"loss": 1.2104, "grad_norm": 1.0257115364074707, "learning_rate": 0.0002, "epoch": 4.140282513395031, "step": 4250}, {"loss": 1.2293, "grad_norm": 0.9590303897857666, "learning_rate": 0.0002, "epoch": 4.15002435460302, "step": 4260}, {"loss": 1.1918, "grad_norm": 1.065291166305542, "learning_rate": 0.0002, "epoch": 4.159766195811009, "step": 4270}, {"loss": 1.2323, "grad_norm": 0.8819697499275208, "learning_rate": 0.0002, "epoch": 4.169508037018996, "step": 4280}, {"loss": 1.2167, "grad_norm": 1.0335261821746826, "learning_rate": 0.0002, "epoch": 4.179249878226985, "step": 4290}, {"loss": 1.2131, "grad_norm": 0.8872809410095215, "learning_rate": 0.0002, "epoch": 4.1889917194349735, "step": 4300}, {"loss": 1.2794, "grad_norm": 0.9883159399032593, "learning_rate": 0.0002, "epoch": 4.198733560642961, "step": 4310}, {"loss": 1.2544, "grad_norm": 1.0254192352294922, "learning_rate": 0.0002, "epoch": 4.20847540185095, "step": 4320}, {"loss": 1.2595, "grad_norm": 0.9432600736618042, "learning_rate": 0.0002, "epoch": 4.218217243058938, "step": 4330}, {"loss": 1.2684, "grad_norm": 1.1008676290512085, "learning_rate": 0.0002, "epoch": 4.227959084266926, "step": 4340}, {"loss": 1.2149, "grad_norm": 1.0829699039459229, "learning_rate": 0.0002, "epoch": 4.237700925474915, "step": 4350}, {"loss": 1.2621, "grad_norm": 1.016847848892212, "learning_rate": 0.0002, "epoch": 4.247442766682903, "step": 4360}, {"loss": 1.2375, "grad_norm": 0.8924864530563354, "learning_rate": 0.0002, "epoch": 4.257184607890891, "step": 4370}, {"loss": 1.1987, "grad_norm": 0.9300530552864075, "learning_rate": 0.0002, "epoch": 4.26692644909888, "step": 4380}, {"loss": 1.1696, "grad_norm": 0.9684814810752869, "learning_rate": 0.0002, "epoch": 4.276668290306868, "step": 4390}, {"loss": 1.2006, "grad_norm": 0.9916250705718994, "learning_rate": 0.0002, "epoch": 4.286410131514856, "step": 4400}, {"loss": 1.2402, "grad_norm": 0.903680145740509, "learning_rate": 0.0002, "epoch": 4.2961519727228445, "step": 4410}, {"loss": 1.2022, "grad_norm": 0.8713505268096924, "learning_rate": 0.0002, "epoch": 4.305893813930833, "step": 4420}, {"loss": 1.1957, "grad_norm": 0.9983905553817749, "learning_rate": 0.0002, "epoch": 4.315635655138821, "step": 4430}, {"loss": 1.2676, "grad_norm": 1.1689040660858154, "learning_rate": 0.0002, "epoch": 4.3253774963468095, "step": 4440}, {"loss": 1.2166, "grad_norm": 0.9316853880882263, "learning_rate": 0.0002, "epoch": 4.335119337554798, "step": 4450}, {"loss": 1.222, "grad_norm": 0.9175887107849121, "learning_rate": 0.0002, "epoch": 4.344861178762786, "step": 4460}, {"loss": 1.2571, "grad_norm": 0.9348906874656677, "learning_rate": 0.0002, "epoch": 4.354603019970774, "step": 4470}, {"loss": 1.2764, "grad_norm": 0.9727016687393188, "learning_rate": 0.0002, "epoch": 4.364344861178763, "step": 4480}, {"loss": 1.2616, "grad_norm": 0.9843429923057556, "learning_rate": 0.0002, "epoch": 4.374086702386751, "step": 4490}, {"loss": 1.2488, "grad_norm": 0.9615852236747742, "learning_rate": 0.0002, "epoch": 4.383828543594739, "step": 4500}, {"loss": 1.1718, "grad_norm": 0.9688583612442017, "learning_rate": 0.0002, "epoch": 4.393570384802728, "step": 4510}, {"loss": 1.2546, "grad_norm": 0.9933668375015259, "learning_rate": 0.0002, "epoch": 4.403312226010716, "step": 4520}, {"loss": 1.2355, "grad_norm": 1.0626686811447144, "learning_rate": 0.0002, "epoch": 4.413054067218704, "step": 4530}, {"loss": 1.2425, "grad_norm": 0.9536267518997192, "learning_rate": 0.0002, "epoch": 4.422795908426693, "step": 4540}, {"loss": 1.2562, "grad_norm": 0.9777140021324158, "learning_rate": 0.0002, "epoch": 4.432537749634681, "step": 4550}, {"loss": 1.2878, "grad_norm": 0.980780839920044, "learning_rate": 0.0002, "epoch": 4.442279590842669, "step": 4560}, {"loss": 1.2597, "grad_norm": 1.0147196054458618, "learning_rate": 0.0002, "epoch": 4.452021432050658, "step": 4570}, {"loss": 1.2148, "grad_norm": 0.9763361811637878, "learning_rate": 0.0002, "epoch": 4.461763273258645, "step": 4580}, {"loss": 1.3076, "grad_norm": 1.0300798416137695, "learning_rate": 0.0002, "epoch": 4.471505114466634, "step": 4590}, {"loss": 1.2665, "grad_norm": 0.8833121657371521, "learning_rate": 0.0002, "epoch": 4.481246955674623, "step": 4600}, {"loss": 1.1899, "grad_norm": 1.1214020252227783, "learning_rate": 0.0002, "epoch": 4.490988796882611, "step": 4610}, {"loss": 1.2579, "grad_norm": 0.8843787908554077, "learning_rate": 0.0002, "epoch": 4.500730638090599, "step": 4620}, {"loss": 1.2633, "grad_norm": 0.9942020773887634, "learning_rate": 0.0002, "epoch": 4.5104724792985875, "step": 4630}, {"loss": 1.3172, "grad_norm": 1.0033202171325684, "learning_rate": 0.0002, "epoch": 4.520214320506576, "step": 4640}, {"loss": 1.2024, "grad_norm": 0.8767235279083252, "learning_rate": 0.0002, "epoch": 4.529956161714564, "step": 4650}, {"loss": 1.2714, "grad_norm": 1.0117276906967163, "learning_rate": 0.0002, "epoch": 4.539698002922552, "step": 4660}, {"loss": 1.2911, "grad_norm": 1.2787362337112427, "learning_rate": 0.0002, "epoch": 4.549439844130541, "step": 4670}, {"loss": 1.2603, "grad_norm": 0.8824878931045532, "learning_rate": 0.0002, "epoch": 4.559181685338529, "step": 4680}, {"loss": 1.2905, "grad_norm": 0.9209560751914978, "learning_rate": 0.0002, "epoch": 4.568923526546517, "step": 4690}, {"loss": 1.1916, "grad_norm": 1.1064010858535767, "learning_rate": 0.0002, "epoch": 4.578665367754506, "step": 4700}, {"loss": 1.2217, "grad_norm": 0.8914572596549988, "learning_rate": 0.0002, "epoch": 4.588407208962494, "step": 4710}, {"loss": 1.2861, "grad_norm": 1.0412265062332153, "learning_rate": 0.0002, "epoch": 4.598149050170482, "step": 4720}, {"loss": 1.262, "grad_norm": 1.1950221061706543, "learning_rate": 0.0002, "epoch": 4.607890891378471, "step": 4730}, {"loss": 1.2659, "grad_norm": 0.8938062787055969, "learning_rate": 0.0002, "epoch": 4.617632732586459, "step": 4740}, {"loss": 1.2621, "grad_norm": 0.9849569201469421, "learning_rate": 0.0002, "epoch": 4.627374573794447, "step": 4750}, {"loss": 1.2341, "grad_norm": 1.0081515312194824, "learning_rate": 0.0002, "epoch": 4.637116415002436, "step": 4760}, {"loss": 1.2023, "grad_norm": 0.8566309213638306, "learning_rate": 0.0002, "epoch": 4.6468582562104235, "step": 4770}, {"loss": 1.2723, "grad_norm": 1.1750118732452393, "learning_rate": 0.0002, "epoch": 4.656600097418412, "step": 4780}, {"loss": 1.2537, "grad_norm": 0.925502598285675, "learning_rate": 0.0002, "epoch": 4.666341938626401, "step": 4790}, {"loss": 1.2146, "grad_norm": 1.0402472019195557, "learning_rate": 0.0002, "epoch": 4.676083779834388, "step": 4800}, {"loss": 1.2555, "grad_norm": 0.9772472977638245, "learning_rate": 0.0002, "epoch": 4.685825621042377, "step": 4810}, {"loss": 1.2667, "grad_norm": 0.9082779288291931, "learning_rate": 0.0002, "epoch": 4.695567462250366, "step": 4820}, {"loss": 1.2465, "grad_norm": 0.8026862740516663, "learning_rate": 0.0002, "epoch": 4.705309303458353, "step": 4830}, {"loss": 1.3369, "grad_norm": 1.1631089448928833, "learning_rate": 0.0002, "epoch": 4.715051144666342, "step": 4840}, {"loss": 1.261, "grad_norm": 0.9384787678718567, "learning_rate": 0.0002, "epoch": 4.7247929858743305, "step": 4850}, {"loss": 1.2588, "grad_norm": 1.2151581048965454, "learning_rate": 0.0002, "epoch": 4.734534827082318, "step": 4860}, {"loss": 1.363, "grad_norm": 0.9679436087608337, "learning_rate": 0.0002, "epoch": 4.744276668290307, "step": 4870}, {"loss": 1.3292, "grad_norm": 0.8352158069610596, "learning_rate": 0.0002, "epoch": 4.754018509498295, "step": 4880}, {"loss": 1.3056, "grad_norm": 1.0205804109573364, "learning_rate": 0.0002, "epoch": 4.763760350706283, "step": 4890}, {"loss": 1.223, "grad_norm": 0.9814772605895996, "learning_rate": 0.0002, "epoch": 4.773502191914272, "step": 4900}, {"loss": 1.3114, "grad_norm": 1.002854347229004, "learning_rate": 0.0002, "epoch": 4.78324403312226, "step": 4910}, {"loss": 1.3143, "grad_norm": 1.1609505414962769, "learning_rate": 0.0002, "epoch": 4.792985874330248, "step": 4920}, {"loss": 1.3166, "grad_norm": 0.9354982376098633, "learning_rate": 0.0002, "epoch": 4.802727715538237, "step": 4930}, {"loss": 1.2978, "grad_norm": 0.9761685729026794, "learning_rate": 0.0002, "epoch": 4.812469556746225, "step": 4940}, {"loss": 1.2709, "grad_norm": 1.0604596138000488, "learning_rate": 0.0002, "epoch": 4.822211397954213, "step": 4950}, {"loss": 1.2765, "grad_norm": 1.0902808904647827, "learning_rate": 0.0002, "epoch": 4.8319532391622015, "step": 4960}, {"loss": 1.3073, "grad_norm": 1.0174955129623413, "learning_rate": 0.0002, "epoch": 4.84169508037019, "step": 4970}, {"loss": 1.3141, "grad_norm": 1.0995253324508667, "learning_rate": 0.0002, "epoch": 4.851436921578179, "step": 4980}, {"loss": 1.3006, "grad_norm": 0.880993127822876, "learning_rate": 0.0002, "epoch": 4.8611787627861665, "step": 4990}, {"loss": 1.2547, "grad_norm": 0.9472237825393677, "learning_rate": 0.0002, "epoch": 4.870920603994155, "step": 5000}, {"loss": 1.4078, "grad_norm": 0.9504236578941345, "learning_rate": 0.0002, "epoch": 4.880662445202143, "step": 5010}, {"loss": 1.2791, "grad_norm": 1.1261742115020752, "learning_rate": 0.0002, "epoch": 4.890404286410131, "step": 5020}, {"loss": 1.3707, "grad_norm": 0.904674768447876, "learning_rate": 0.0002, "epoch": 4.90014612761812, "step": 5030}, {"loss": 1.2762, "grad_norm": 0.8828991055488586, "learning_rate": 0.0002, "epoch": 4.909887968826109, "step": 5040}, {"loss": 1.2905, "grad_norm": 1.0156532526016235, "learning_rate": 0.0002, "epoch": 4.919629810034096, "step": 5050}, {"loss": 1.3079, "grad_norm": 0.8975168466567993, "learning_rate": 0.0002, "epoch": 4.929371651242085, "step": 5060}, {"loss": 1.3322, "grad_norm": 0.9787213802337646, "learning_rate": 0.0002, "epoch": 4.939113492450073, "step": 5070}, {"loss": 1.2533, "grad_norm": 1.0801568031311035, "learning_rate": 0.0002, "epoch": 4.948855333658061, "step": 5080}, {"loss": 1.238, "grad_norm": 1.0655089616775513, "learning_rate": 0.0002, "epoch": 4.95859717486605, "step": 5090}, {"loss": 1.2449, "grad_norm": 0.8941320180892944, "learning_rate": 0.0002, "epoch": 4.968339016074038, "step": 5100}, {"loss": 1.2846, "grad_norm": 1.050621747970581, "learning_rate": 0.0002, "epoch": 4.978080857282026, "step": 5110}, {"loss": 1.3791, "grad_norm": 0.9724781513214111, "learning_rate": 0.0002, "epoch": 4.987822698490015, "step": 5120}, {"loss": 1.292, "grad_norm": 0.9850538969039917, "learning_rate": 0.0002, "epoch": 4.997564539698003, "step": 5130}, {"eval_loss": 2.0824170112609863, "eval_runtime": 55.592, "eval_samples_per_second": 9.12, "eval_steps_per_second": 1.151, "epoch": 4.9995129079396, "step": 5132}, {"loss": 1.037, "grad_norm": 1.0096189975738525, "learning_rate": 0.0002, "epoch": 5.007306380905991, "step": 5140}, {"loss": 1.0003, "grad_norm": 1.2403408288955688, "learning_rate": 0.0002, "epoch": 5.01704822211398, "step": 5150}, {"loss": 1.0129, "grad_norm": 1.1243221759796143, "learning_rate": 0.0002, "epoch": 5.026790063321968, "step": 5160}, {"loss": 0.9815, "grad_norm": 1.4745502471923828, "learning_rate": 0.0002, "epoch": 5.036531904529956, "step": 5170}, {"loss": 0.9715, "grad_norm": 1.1913198232650757, "learning_rate": 0.0002, "epoch": 5.0462737457379445, "step": 5180}, {"loss": 0.9282, "grad_norm": 1.2732855081558228, "learning_rate": 0.0002, "epoch": 5.056015586945933, "step": 5190}, {"loss": 0.9857, "grad_norm": 1.1737396717071533, "learning_rate": 0.0002, "epoch": 5.065757428153921, "step": 5200}, {"loss": 0.9754, "grad_norm": 1.4162768125534058, "learning_rate": 0.0002, "epoch": 5.075499269361909, "step": 5210}, {"loss": 1.0333, "grad_norm": 1.528274655342102, "learning_rate": 0.0002, "epoch": 5.085241110569898, "step": 5220}, {"loss": 1.0227, "grad_norm": 1.3966618776321411, "learning_rate": 0.0002, "epoch": 5.094982951777886, "step": 5230}, {"loss": 0.987, "grad_norm": 1.3427953720092773, "learning_rate": 0.0002, "epoch": 5.104724792985874, "step": 5240}, {"loss": 1.0353, "grad_norm": 1.6533905267715454, "learning_rate": 0.0002, "epoch": 5.114466634193863, "step": 5250}, {"loss": 1.0452, "grad_norm": 1.4114865064620972, "learning_rate": 0.0002, "epoch": 5.124208475401851, "step": 5260}, {"loss": 1.067, "grad_norm": 1.5460708141326904, "learning_rate": 0.0002, "epoch": 5.133950316609839, "step": 5270}, {"loss": 1.0667, "grad_norm": 1.3491919040679932, "learning_rate": 0.0002, "epoch": 5.143692157817828, "step": 5280}, {"loss": 0.9957, "grad_norm": 1.2208969593048096, "learning_rate": 0.0002, "epoch": 5.153433999025816, "step": 5290}, {"loss": 1.0362, "grad_norm": 1.1141403913497925, "learning_rate": 0.0002, "epoch": 5.163175840233804, "step": 5300}, {"loss": 0.9744, "grad_norm": 1.2938064336776733, "learning_rate": 0.0002, "epoch": 5.172917681441793, "step": 5310}, {"loss": 1.0438, "grad_norm": 1.2704918384552002, "learning_rate": 0.0002, "epoch": 5.1826595226497805, "step": 5320}, {"loss": 1.0015, "grad_norm": 1.3928544521331787, "learning_rate": 0.0002, "epoch": 5.192401363857769, "step": 5330}, {"loss": 1.025, "grad_norm": 1.1993824243545532, "learning_rate": 0.0002, "epoch": 5.202143205065758, "step": 5340}, {"loss": 1.0195, "grad_norm": 1.5913670063018799, "learning_rate": 0.0002, "epoch": 5.211885046273745, "step": 5350}, {"loss": 1.0113, "grad_norm": 1.1577855348587036, "learning_rate": 0.0002, "epoch": 5.221626887481734, "step": 5360}, {"loss": 1.0684, "grad_norm": 1.4535993337631226, "learning_rate": 0.0002, "epoch": 5.231368728689723, "step": 5370}, {"loss": 1.0255, "grad_norm": 1.5068976879119873, "learning_rate": 0.0002, "epoch": 5.24111056989771, "step": 5380}, {"loss": 1.0068, "grad_norm": 1.2365459203720093, "learning_rate": 0.0002, "epoch": 5.250852411105699, "step": 5390}, {"loss": 1.0145, "grad_norm": 1.3197922706604004, "learning_rate": 0.0002, "epoch": 5.2605942523136875, "step": 5400}, {"loss": 1.0767, "grad_norm": 1.2395117282867432, "learning_rate": 0.0002, "epoch": 5.270336093521675, "step": 5410}, {"loss": 1.0292, "grad_norm": 1.1841236352920532, "learning_rate": 0.0002, "epoch": 5.280077934729664, "step": 5420}, {"loss": 1.0233, "grad_norm": 1.218003749847412, "learning_rate": 0.0002, "epoch": 5.289819775937652, "step": 5430}, {"loss": 1.0093, "grad_norm": 1.2210947275161743, "learning_rate": 0.0002, "epoch": 5.29956161714564, "step": 5440}, {"loss": 0.9619, "grad_norm": 1.266006588935852, "learning_rate": 0.0002, "epoch": 5.309303458353629, "step": 5450}, {"loss": 1.0352, "grad_norm": 1.2598075866699219, "learning_rate": 0.0002, "epoch": 5.319045299561617, "step": 5460}, {"loss": 1.0929, "grad_norm": 1.2410019636154175, "learning_rate": 0.0002, "epoch": 5.328787140769606, "step": 5470}, {"loss": 1.058, "grad_norm": 1.249698519706726, "learning_rate": 0.0002, "epoch": 5.338528981977594, "step": 5480}, {"loss": 1.0457, "grad_norm": 1.2398173809051514, "learning_rate": 0.0002, "epoch": 5.348270823185582, "step": 5490}, {"loss": 1.0139, "grad_norm": 1.2416654825210571, "learning_rate": 0.0002, "epoch": 5.35801266439357, "step": 5500}, {"loss": 1.0609, "grad_norm": 1.398706316947937, "learning_rate": 0.0002, "epoch": 5.3677545056015585, "step": 5510}, {"loss": 1.0512, "grad_norm": 1.3049418926239014, "learning_rate": 0.0002, "epoch": 5.377496346809547, "step": 5520}, {"loss": 1.0912, "grad_norm": 1.2528893947601318, "learning_rate": 0.0002, "epoch": 5.387238188017536, "step": 5530}, {"loss": 1.0619, "grad_norm": 1.2963255643844604, "learning_rate": 0.0002, "epoch": 5.3969800292255234, "step": 5540}, {"loss": 1.0194, "grad_norm": 1.494231104850769, "learning_rate": 0.0002, "epoch": 5.406721870433512, "step": 5550}, {"loss": 1.0179, "grad_norm": 1.2760992050170898, "learning_rate": 0.0002, "epoch": 5.416463711641501, "step": 5560}, {"loss": 1.1088, "grad_norm": 1.195292592048645, "learning_rate": 0.0002, "epoch": 5.426205552849488, "step": 5570}, {"loss": 1.0859, "grad_norm": 1.6408965587615967, "learning_rate": 0.0002, "epoch": 5.435947394057477, "step": 5580}, {"loss": 1.0868, "grad_norm": 1.3092058897018433, "learning_rate": 0.0002, "epoch": 5.4456892352654656, "step": 5590}, {"loss": 1.006, "grad_norm": 1.2960586547851562, "learning_rate": 0.0002, "epoch": 5.455431076473453, "step": 5600}, {"loss": 1.0257, "grad_norm": 1.3560487031936646, "learning_rate": 0.0002, "epoch": 5.465172917681442, "step": 5610}, {"loss": 1.0314, "grad_norm": 1.1896311044692993, "learning_rate": 0.0002, "epoch": 5.4749147588894305, "step": 5620}, {"loss": 1.0435, "grad_norm": 1.3145595788955688, "learning_rate": 0.0002, "epoch": 5.484656600097418, "step": 5630}, {"loss": 1.0456, "grad_norm": 1.2207404375076294, "learning_rate": 0.0002, "epoch": 5.494398441305407, "step": 5640}, {"loss": 1.0823, "grad_norm": 1.266015887260437, "learning_rate": 0.0002, "epoch": 5.504140282513395, "step": 5650}, {"loss": 1.0696, "grad_norm": 1.2478289604187012, "learning_rate": 0.0002, "epoch": 5.513882123721383, "step": 5660}, {"loss": 1.0695, "grad_norm": 1.4851372241973877, "learning_rate": 0.0002, "epoch": 5.523623964929372, "step": 5670}, {"loss": 1.0736, "grad_norm": 1.4478679895401, "learning_rate": 0.0002, "epoch": 5.53336580613736, "step": 5680}, {"loss": 1.043, "grad_norm": 1.1079537868499756, "learning_rate": 0.0002, "epoch": 5.543107647345348, "step": 5690}, {"loss": 1.1107, "grad_norm": 1.4201879501342773, "learning_rate": 0.0002, "epoch": 5.552849488553337, "step": 5700}, {"loss": 1.0697, "grad_norm": 1.2092000246047974, "learning_rate": 0.0002, "epoch": 5.562591329761325, "step": 5710}, {"loss": 0.9868, "grad_norm": 1.4515851736068726, "learning_rate": 0.0002, "epoch": 5.572333170969313, "step": 5720}, {"loss": 1.1547, "grad_norm": 1.3260412216186523, "learning_rate": 0.0002, "epoch": 5.5820750121773015, "step": 5730}, {"loss": 1.1388, "grad_norm": 1.248191475868225, "learning_rate": 0.0002, "epoch": 5.59181685338529, "step": 5740}, {"loss": 1.0597, "grad_norm": 1.2037307024002075, "learning_rate": 0.0002, "epoch": 5.601558694593278, "step": 5750}, {"loss": 1.1425, "grad_norm": 1.341237187385559, "learning_rate": 0.0002, "epoch": 5.611300535801266, "step": 5760}, {"loss": 1.0942, "grad_norm": 1.130115270614624, "learning_rate": 0.0002, "epoch": 5.621042377009255, "step": 5770}, {"loss": 1.1029, "grad_norm": 1.3834772109985352, "learning_rate": 0.0002, "epoch": 5.630784218217243, "step": 5780}, {"loss": 1.0825, "grad_norm": 1.2586270570755005, "learning_rate": 0.0002, "epoch": 5.640526059425231, "step": 5790}, {"loss": 1.0186, "grad_norm": 1.3233023881912231, "learning_rate": 0.0002, "epoch": 5.65026790063322, "step": 5800}, {"loss": 1.0557, "grad_norm": 1.2711341381072998, "learning_rate": 0.0002, "epoch": 5.660009741841208, "step": 5810}, {"loss": 1.0897, "grad_norm": 1.3867720365524292, "learning_rate": 0.0002, "epoch": 5.669751583049196, "step": 5820}, {"loss": 1.0776, "grad_norm": 1.4783269166946411, "learning_rate": 0.0002, "epoch": 5.679493424257185, "step": 5830}, {"loss": 1.0632, "grad_norm": 1.2744768857955933, "learning_rate": 0.0002, "epoch": 5.6892352654651726, "step": 5840}, {"loss": 1.1484, "grad_norm": 1.3405882120132446, "learning_rate": 0.0002, "epoch": 5.698977106673161, "step": 5850}, {"loss": 1.0975, "grad_norm": 1.204300880432129, "learning_rate": 0.0002, "epoch": 5.70871894788115, "step": 5860}, {"loss": 1.0494, "grad_norm": 1.2954572439193726, "learning_rate": 0.0002, "epoch": 5.7184607890891375, "step": 5870}, {"loss": 1.0643, "grad_norm": 1.5478382110595703, "learning_rate": 0.0002, "epoch": 5.728202630297126, "step": 5880}, {"loss": 1.0582, "grad_norm": 1.2095842361450195, "learning_rate": 0.0002, "epoch": 5.737944471505115, "step": 5890}, {"loss": 1.1, "grad_norm": 1.0691519975662231, "learning_rate": 0.0002, "epoch": 5.747686312713103, "step": 5900}, {"loss": 1.0906, "grad_norm": 1.1920677423477173, "learning_rate": 0.0002, "epoch": 5.757428153921091, "step": 5910}, {"loss": 1.1746, "grad_norm": 1.2051277160644531, "learning_rate": 0.0002, "epoch": 5.76716999512908, "step": 5920}, {"loss": 1.1221, "grad_norm": 1.197490930557251, "learning_rate": 0.0002, "epoch": 5.776911836337067, "step": 5930}, {"loss": 1.07, "grad_norm": 1.2003998756408691, "learning_rate": 0.0002, "epoch": 5.786653677545056, "step": 5940}, {"loss": 1.0938, "grad_norm": 1.2323646545410156, "learning_rate": 0.0002, "epoch": 5.7963955187530445, "step": 5950}, {"loss": 1.1443, "grad_norm": 1.2593932151794434, "learning_rate": 0.0002, "epoch": 5.806137359961033, "step": 5960}, {"loss": 1.0829, "grad_norm": 1.1835976839065552, "learning_rate": 0.0002, "epoch": 5.815879201169021, "step": 5970}, {"loss": 1.1056, "grad_norm": 1.4770104885101318, "learning_rate": 0.0002, "epoch": 5.825621042377009, "step": 5980}, {"loss": 1.1934, "grad_norm": 1.1025809049606323, "learning_rate": 0.0002, "epoch": 5.835362883584997, "step": 5990}, {"loss": 1.1323, "grad_norm": 1.364588975906372, "learning_rate": 0.0002, "epoch": 5.845104724792986, "step": 6000}, {"loss": 1.1234, "grad_norm": 1.2340112924575806, "learning_rate": 0.0002, "epoch": 5.854846566000974, "step": 6010}, {"loss": 1.1123, "grad_norm": 1.4925711154937744, "learning_rate": 0.0002, "epoch": 5.864588407208963, "step": 6020}, {"loss": 1.12, "grad_norm": 1.3516744375228882, "learning_rate": 0.0002, "epoch": 5.874330248416951, "step": 6030}, {"loss": 1.1399, "grad_norm": 1.2058138847351074, "learning_rate": 0.0002, "epoch": 5.884072089624939, "step": 6040}, {"loss": 1.1074, "grad_norm": 1.13870108127594, "learning_rate": 0.0002, "epoch": 5.893813930832927, "step": 6050}, {"loss": 1.088, "grad_norm": 1.1587319374084473, "learning_rate": 0.0002, "epoch": 5.9035557720409155, "step": 6060}, {"loss": 1.1376, "grad_norm": 1.164481520652771, "learning_rate": 0.0002, "epoch": 5.913297613248904, "step": 6070}, {"loss": 1.1262, "grad_norm": 1.2115206718444824, "learning_rate": 0.0002, "epoch": 5.923039454456893, "step": 6080}, {"loss": 1.1345, "grad_norm": 1.3201590776443481, "learning_rate": 0.0002, "epoch": 5.93278129566488, "step": 6090}, {"loss": 1.1288, "grad_norm": 1.287380576133728, "learning_rate": 0.0002, "epoch": 5.942523136872869, "step": 6100}, {"loss": 1.1475, "grad_norm": 1.1820166110992432, "learning_rate": 0.0002, "epoch": 5.952264978080858, "step": 6110}, {"loss": 1.1112, "grad_norm": 1.2550667524337769, "learning_rate": 0.0002, "epoch": 5.962006819288845, "step": 6120}, {"loss": 1.1528, "grad_norm": 1.3547813892364502, "learning_rate": 0.0002, "epoch": 5.971748660496834, "step": 6130}, {"loss": 1.0557, "grad_norm": 1.260842204093933, "learning_rate": 0.0002, "epoch": 5.9814905017048225, "step": 6140}, {"loss": 1.1119, "grad_norm": 1.1643036603927612, "learning_rate": 0.0002, "epoch": 5.99123234291281, "step": 6150}, {"eval_loss": 2.2628161907196045, "eval_runtime": 57.2379, "eval_samples_per_second": 8.858, "eval_steps_per_second": 1.118, "epoch": 6.0, "step": 6159}, {"loss": 1.0837, "grad_norm": 0.9384723901748657, "learning_rate": 0.0002, "epoch": 6.000974184120799, "step": 6160}, {"loss": 0.7335, "grad_norm": 2.1525821685791016, "learning_rate": 0.0002, "epoch": 6.0107160253287875, "step": 6170}, {"loss": 0.8416, "grad_norm": 2.0194077491760254, "learning_rate": 0.0002, "epoch": 6.020457866536775, "step": 6180}, {"loss": 0.8443, "grad_norm": 1.5257816314697266, "learning_rate": 0.0002, "epoch": 6.030199707744764, "step": 6190}, {"loss": 0.7543, "grad_norm": 1.5432662963867188, "learning_rate": 0.0002, "epoch": 6.039941548952752, "step": 6200}, {"loss": 0.8104, "grad_norm": 1.6874405145645142, "learning_rate": 0.0002, "epoch": 6.04968339016074, "step": 6210}, {"loss": 0.8395, "grad_norm": 1.7346407175064087, "learning_rate": 0.0002, "epoch": 6.059425231368729, "step": 6220}, {"loss": 0.8027, "grad_norm": 1.5320781469345093, "learning_rate": 0.0002, "epoch": 6.069167072576717, "step": 6230}, {"loss": 0.7488, "grad_norm": 1.4106669425964355, "learning_rate": 0.0002, "epoch": 6.078908913784705, "step": 6240}, {"loss": 0.812, "grad_norm": 1.5568628311157227, "learning_rate": 0.0002, "epoch": 6.088650754992694, "step": 6250}, {"loss": 0.8055, "grad_norm": 1.6155978441238403, "learning_rate": 0.0002, "epoch": 6.098392596200682, "step": 6260}, {"loss": 0.8225, "grad_norm": 1.4820445775985718, "learning_rate": 0.0002, "epoch": 6.10813443740867, "step": 6270}, {"loss": 0.8599, "grad_norm": 1.6163820028305054, "learning_rate": 0.0002, "epoch": 6.1178762786166585, "step": 6280}, {"loss": 0.853, "grad_norm": 1.8396387100219727, "learning_rate": 0.0002, "epoch": 6.127618119824647, "step": 6290}, {"loss": 0.7768, "grad_norm": 1.7181230783462524, "learning_rate": 0.0002, "epoch": 6.137359961032635, "step": 6300}, {"loss": 0.8116, "grad_norm": 1.6568509340286255, "learning_rate": 0.0002, "epoch": 6.147101802240623, "step": 6310}, {"loss": 0.8525, "grad_norm": 1.3481947183609009, "learning_rate": 0.0002, "epoch": 6.156843643448612, "step": 6320}, {"loss": 0.762, "grad_norm": 1.5788342952728271, "learning_rate": 0.0002, "epoch": 6.1665854846566, "step": 6330}, {"loss": 0.886, "grad_norm": 1.5067620277404785, "learning_rate": 0.0002, "epoch": 6.176327325864588, "step": 6340}, {"loss": 0.8375, "grad_norm": 1.8198208808898926, "learning_rate": 0.0002, "epoch": 6.186069167072577, "step": 6350}, {"loss": 0.7867, "grad_norm": 1.4012749195098877, "learning_rate": 0.0002, "epoch": 6.195811008280565, "step": 6360}, {"loss": 0.8144, "grad_norm": 1.759798288345337, "learning_rate": 0.0002, "epoch": 6.205552849488553, "step": 6370}, {"loss": 0.7811, "grad_norm": 1.468922734260559, "learning_rate": 0.0002, "epoch": 6.215294690696542, "step": 6380}, {"loss": 0.8356, "grad_norm": 1.3706471920013428, "learning_rate": 0.0002, "epoch": 6.2250365319045295, "step": 6390}, {"loss": 0.8096, "grad_norm": 1.6397383213043213, "learning_rate": 0.0002, "epoch": 6.234778373112518, "step": 6400}, {"loss": 0.8834, "grad_norm": 1.5614187717437744, "learning_rate": 0.0002, "epoch": 6.244520214320507, "step": 6410}, {"loss": 0.8533, "grad_norm": 1.7118678092956543, "learning_rate": 0.0002, "epoch": 6.2542620555284945, "step": 6420}, {"loss": 0.8653, "grad_norm": 1.4041547775268555, "learning_rate": 0.0002, "epoch": 6.264003896736483, "step": 6430}, {"loss": 0.879, "grad_norm": 1.7653605937957764, "learning_rate": 0.0002, "epoch": 6.273745737944472, "step": 6440}, {"loss": 0.8786, "grad_norm": 2.6219191551208496, "learning_rate": 0.0002, "epoch": 6.28348757915246, "step": 6450}, {"loss": 0.8896, "grad_norm": 1.4757837057113647, "learning_rate": 0.0002, "epoch": 6.293229420360448, "step": 6460}, {"loss": 0.9079, "grad_norm": 1.715598225593567, "learning_rate": 0.0002, "epoch": 6.302971261568437, "step": 6470}, {"loss": 0.8526, "grad_norm": 1.376216173171997, "learning_rate": 0.0002, "epoch": 6.312713102776424, "step": 6480}, {"loss": 0.8742, "grad_norm": 1.7119828462600708, "learning_rate": 0.0002, "epoch": 6.322454943984413, "step": 6490}, {"loss": 0.7988, "grad_norm": 1.4304355382919312, "learning_rate": 0.0002, "epoch": 6.3321967851924015, "step": 6500}, {"loss": 0.8539, "grad_norm": 1.4889872074127197, "learning_rate": 0.0002, "epoch": 6.34193862640039, "step": 6510}, {"loss": 0.9328, "grad_norm": 1.370373010635376, "learning_rate": 0.0002, "epoch": 6.351680467608378, "step": 6520}, {"loss": 0.8997, "grad_norm": 1.7697709798812866, "learning_rate": 0.0002, "epoch": 6.361422308816366, "step": 6530}, {"loss": 0.9421, "grad_norm": 1.495297908782959, "learning_rate": 0.0002, "epoch": 6.371164150024355, "step": 6540}, {"loss": 0.8796, "grad_norm": 1.7251347303390503, "learning_rate": 0.0002, "epoch": 6.380905991232343, "step": 6550}, {"loss": 0.9327, "grad_norm": 1.6909505128860474, "learning_rate": 0.0002, "epoch": 6.390647832440331, "step": 6560}, {"loss": 0.837, "grad_norm": 1.4369314908981323, "learning_rate": 0.0002, "epoch": 6.40038967364832, "step": 6570}, {"loss": 0.8572, "grad_norm": 1.7803739309310913, "learning_rate": 0.0002, "epoch": 6.410131514856308, "step": 6580}, {"loss": 0.9024, "grad_norm": 1.6107097864151, "learning_rate": 0.0002, "epoch": 6.419873356064296, "step": 6590}, {"loss": 0.8469, "grad_norm": 1.6151643991470337, "learning_rate": 0.0002, "epoch": 6.429615197272285, "step": 6600}, {"loss": 0.8791, "grad_norm": 1.7159833908081055, "learning_rate": 0.0002, "epoch": 6.4393570384802725, "step": 6610}, {"loss": 0.9249, "grad_norm": 1.4366064071655273, "learning_rate": 0.0002, "epoch": 6.449098879688261, "step": 6620}, {"loss": 0.8417, "grad_norm": 1.6050453186035156, "learning_rate": 0.0002, "epoch": 6.45884072089625, "step": 6630}, {"loss": 0.8943, "grad_norm": 1.6296740770339966, "learning_rate": 0.0002, "epoch": 6.468582562104237, "step": 6640}, {"loss": 0.9228, "grad_norm": 1.6181174516677856, "learning_rate": 0.0002, "epoch": 6.478324403312226, "step": 6650}, {"loss": 0.9139, "grad_norm": 1.5452176332473755, "learning_rate": 0.0002, "epoch": 6.488066244520215, "step": 6660}, {"loss": 0.9022, "grad_norm": 1.3919731378555298, "learning_rate": 0.0002, "epoch": 6.497808085728202, "step": 6670}, {"loss": 0.9046, "grad_norm": 1.6456257104873657, "learning_rate": 0.0002, "epoch": 6.507549926936191, "step": 6680}, {"loss": 0.9041, "grad_norm": 1.4147369861602783, "learning_rate": 0.0002, "epoch": 6.5172917681441795, "step": 6690}, {"loss": 0.8361, "grad_norm": 1.7005025148391724, "learning_rate": 0.0002, "epoch": 6.527033609352167, "step": 6700}, {"loss": 0.8738, "grad_norm": 1.6032357215881348, "learning_rate": 0.0002, "epoch": 6.536775450560156, "step": 6710}, {"loss": 0.9796, "grad_norm": 1.3454229831695557, "learning_rate": 0.0002, "epoch": 6.5465172917681445, "step": 6720}, {"loss": 0.8573, "grad_norm": 1.6961418390274048, "learning_rate": 0.0002, "epoch": 6.556259132976132, "step": 6730}, {"loss": 0.9241, "grad_norm": 1.78407883644104, "learning_rate": 0.0002, "epoch": 6.566000974184121, "step": 6740}, {"loss": 0.8941, "grad_norm": 1.6817889213562012, "learning_rate": 0.0002, "epoch": 6.575742815392109, "step": 6750}, {"loss": 0.8765, "grad_norm": 1.7894943952560425, "learning_rate": 0.0002, "epoch": 6.585484656600097, "step": 6760}, {"loss": 0.8607, "grad_norm": 1.6404837369918823, "learning_rate": 0.0002, "epoch": 6.595226497808086, "step": 6770}, {"loss": 0.8573, "grad_norm": 1.5849255323410034, "learning_rate": 0.0002, "epoch": 6.604968339016074, "step": 6780}, {"loss": 0.9575, "grad_norm": 1.5993813276290894, "learning_rate": 0.0002, "epoch": 6.614710180224062, "step": 6790}, {"loss": 0.8922, "grad_norm": 1.2834863662719727, "learning_rate": 0.0002, "epoch": 6.624452021432051, "step": 6800}, {"loss": 0.9007, "grad_norm": 1.7215641736984253, "learning_rate": 0.0002, "epoch": 6.634193862640039, "step": 6810}, {"loss": 0.9292, "grad_norm": 1.7588146924972534, "learning_rate": 0.0002, "epoch": 6.643935703848027, "step": 6820}, {"loss": 0.8634, "grad_norm": 1.7956023216247559, "learning_rate": 0.0002, "epoch": 6.6536775450560155, "step": 6830}, {"loss": 0.8108, "grad_norm": 1.5115351676940918, "learning_rate": 0.0002, "epoch": 6.663419386264004, "step": 6840}, {"loss": 0.9329, "grad_norm": 1.5660319328308105, "learning_rate": 0.0002, "epoch": 6.673161227471992, "step": 6850}, {"loss": 0.9877, "grad_norm": 1.4323679208755493, "learning_rate": 0.0002, "epoch": 6.68290306867998, "step": 6860}, {"loss": 0.8732, "grad_norm": 1.662089467048645, "learning_rate": 0.0002, "epoch": 6.692644909887969, "step": 6870}, {"loss": 0.87, "grad_norm": 1.7854869365692139, "learning_rate": 0.0002, "epoch": 6.702386751095958, "step": 6880}, {"loss": 0.9105, "grad_norm": 1.5491222143173218, "learning_rate": 0.0002, "epoch": 6.712128592303945, "step": 6890}, {"loss": 0.9147, "grad_norm": 1.5946987867355347, "learning_rate": 0.0002, "epoch": 6.721870433511934, "step": 6900}, {"loss": 0.9391, "grad_norm": 1.6195964813232422, "learning_rate": 0.0002, "epoch": 6.731612274719922, "step": 6910}, {"loss": 0.8947, "grad_norm": 1.6366901397705078, "learning_rate": 0.0002, "epoch": 6.74135411592791, "step": 6920}, {"loss": 0.8695, "grad_norm": 1.5080382823944092, "learning_rate": 0.0002, "epoch": 6.751095957135899, "step": 6930}, {"loss": 0.9124, "grad_norm": 1.742353916168213, "learning_rate": 0.0002, "epoch": 6.760837798343887, "step": 6940}, {"loss": 0.9118, "grad_norm": 1.690251111984253, "learning_rate": 0.0002, "epoch": 6.770579639551875, "step": 6950}, {"loss": 0.9039, "grad_norm": 1.7103357315063477, "learning_rate": 0.0002, "epoch": 6.780321480759864, "step": 6960}, {"loss": 0.869, "grad_norm": 1.6630914211273193, "learning_rate": 0.0002, "epoch": 6.7900633219678515, "step": 6970}, {"loss": 0.8944, "grad_norm": 1.423768162727356, "learning_rate": 0.0002, "epoch": 6.79980516317584, "step": 6980}, {"loss": 0.9397, "grad_norm": 1.7844693660736084, "learning_rate": 0.0002, "epoch": 6.809547004383829, "step": 6990}, {"loss": 0.8889, "grad_norm": 1.545282006263733, "learning_rate": 0.0002, "epoch": 6.819288845591817, "step": 7000}, {"loss": 0.9333, "grad_norm": 1.4340319633483887, "learning_rate": 0.0002, "epoch": 6.829030686799805, "step": 7010}, {"loss": 0.9486, "grad_norm": 1.5981626510620117, "learning_rate": 0.0002, "epoch": 6.838772528007794, "step": 7020}, {"loss": 0.9062, "grad_norm": 1.5205026865005493, "learning_rate": 0.0002, "epoch": 6.848514369215782, "step": 7030}, {"loss": 0.9245, "grad_norm": 1.6999989748001099, "learning_rate": 0.0002, "epoch": 6.85825621042377, "step": 7040}, {"loss": 0.9313, "grad_norm": 1.6392347812652588, "learning_rate": 0.0002, "epoch": 6.8679980516317585, "step": 7050}, {"loss": 0.9275, "grad_norm": 1.637308955192566, "learning_rate": 0.0002, "epoch": 6.877739892839747, "step": 7060}, {"loss": 0.9672, "grad_norm": 1.671341896057129, "learning_rate": 0.0002, "epoch": 6.887481734047735, "step": 7070}, {"loss": 0.9726, "grad_norm": 1.4437555074691772, "learning_rate": 0.0002, "epoch": 6.897223575255723, "step": 7080}, {"loss": 0.9454, "grad_norm": 1.4251935482025146, "learning_rate": 0.0002, "epoch": 6.906965416463712, "step": 7090}, {"loss": 0.8858, "grad_norm": 1.5106734037399292, "learning_rate": 0.0002, "epoch": 6.9167072576717, "step": 7100}, {"loss": 0.939, "grad_norm": 1.670742154121399, "learning_rate": 0.0002, "epoch": 6.926449098879688, "step": 7110}, {"loss": 0.8818, "grad_norm": 1.4353723526000977, "learning_rate": 0.0002, "epoch": 6.936190940087677, "step": 7120}, {"loss": 0.9354, "grad_norm": 1.9437772035598755, "learning_rate": 0.0002, "epoch": 6.945932781295665, "step": 7130}, {"loss": 0.9623, "grad_norm": 1.4922038316726685, "learning_rate": 0.0002, "epoch": 6.955674622503653, "step": 7140}, {"loss": 0.9653, "grad_norm": 1.489193081855774, "learning_rate": 0.0002, "epoch": 6.965416463711642, "step": 7150}, {"loss": 1.0024, "grad_norm": 1.529490351676941, "learning_rate": 0.0002, "epoch": 6.9751583049196295, "step": 7160}, {"loss": 0.9715, "grad_norm": 1.7370105981826782, "learning_rate": 0.0002, "epoch": 6.984900146127618, "step": 7170}, {"loss": 0.921, "grad_norm": 1.5639604330062866, "learning_rate": 0.0002, "epoch": 6.994641987335607, "step": 7180}]} +{"epoch": 7.996103263516805, "step": 8208, "epoch_duration": 1626.2399775981903, "total_accumulated_duration": 14136.657781124115, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-4/checkpoint-1026", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0782, "grad_norm": 0.6537588834762573, "learning_rate": 0.0002, "epoch": 0.00974184120798831, "step": 10}, {"loss": 2.4644, "grad_norm": 0.5270306468009949, "learning_rate": 0.0002, "epoch": 0.01948368241597662, "step": 20}, {"loss": 2.2589, "grad_norm": 0.6826501488685608, "learning_rate": 0.0002, "epoch": 0.029225523623964928, "step": 30}, {"loss": 2.0141, "grad_norm": 0.5061377286911011, "learning_rate": 0.0002, "epoch": 0.03896736483195324, "step": 40}, {"loss": 1.9458, "grad_norm": 0.4300410747528076, "learning_rate": 0.0002, "epoch": 0.04870920603994155, "step": 50}, {"loss": 1.983, "grad_norm": 0.5063319802284241, "learning_rate": 0.0002, "epoch": 0.058451047247929856, "step": 60}, {"loss": 1.9799, "grad_norm": 0.49310117959976196, "learning_rate": 0.0002, "epoch": 0.06819288845591817, "step": 70}, {"loss": 1.9277, "grad_norm": 0.4676004648208618, "learning_rate": 0.0002, "epoch": 0.07793472966390648, "step": 80}, {"loss": 1.9147, "grad_norm": 0.41647228598594666, "learning_rate": 0.0002, "epoch": 0.08767657087189479, "step": 90}, {"loss": 1.8894, "grad_norm": 0.40217313170433044, "learning_rate": 0.0002, "epoch": 0.0974184120798831, "step": 100}, {"loss": 1.9099, "grad_norm": 0.4123637080192566, "learning_rate": 0.0002, "epoch": 0.1071602532878714, "step": 110}, {"loss": 1.8471, "grad_norm": 0.37414297461509705, "learning_rate": 0.0002, "epoch": 0.11690209449585971, "step": 120}, {"loss": 1.894, "grad_norm": 0.3179326355457306, "learning_rate": 0.0002, "epoch": 0.12664393570384802, "step": 130}, {"loss": 1.847, "grad_norm": 0.3548192083835602, "learning_rate": 0.0002, "epoch": 0.13638577691183634, "step": 140}, {"loss": 1.7919, "grad_norm": 0.3273540139198303, "learning_rate": 0.0002, "epoch": 0.14612761811982464, "step": 150}, {"loss": 1.8496, "grad_norm": 0.36500975489616394, "learning_rate": 0.0002, "epoch": 0.15586945932781296, "step": 160}, {"loss": 1.8473, "grad_norm": 0.4106619656085968, "learning_rate": 0.0002, "epoch": 0.16561130053580125, "step": 170}, {"loss": 1.841, "grad_norm": 0.41361644864082336, "learning_rate": 0.0002, "epoch": 0.17535314174378958, "step": 180}, {"loss": 1.8879, "grad_norm": 0.3608580231666565, "learning_rate": 0.0002, "epoch": 0.1850949829517779, "step": 190}, {"loss": 1.7717, "grad_norm": 0.4291760325431824, "learning_rate": 0.0002, "epoch": 0.1948368241597662, "step": 200}, {"loss": 1.8437, "grad_norm": 0.344184011220932, "learning_rate": 0.0002, "epoch": 0.20457866536775451, "step": 210}, {"loss": 1.8779, "grad_norm": 0.3834705650806427, "learning_rate": 0.0002, "epoch": 0.2143205065757428, "step": 220}, {"loss": 1.7533, "grad_norm": 0.3738210201263428, "learning_rate": 0.0002, "epoch": 0.22406234778373113, "step": 230}, {"loss": 1.824, "grad_norm": 0.4306780695915222, "learning_rate": 0.0002, "epoch": 0.23380418899171942, "step": 240}, {"loss": 1.8519, "grad_norm": 0.5066465139389038, "learning_rate": 0.0002, "epoch": 0.24354603019970775, "step": 250}, {"loss": 1.7402, "grad_norm": 0.34227681159973145, "learning_rate": 0.0002, "epoch": 0.25328787140769604, "step": 260}, {"loss": 1.8614, "grad_norm": 0.3346865475177765, "learning_rate": 0.0002, "epoch": 0.26302971261568436, "step": 270}, {"loss": 1.8502, "grad_norm": 0.3639362156391144, "learning_rate": 0.0002, "epoch": 0.2727715538236727, "step": 280}, {"loss": 1.8428, "grad_norm": 0.33223700523376465, "learning_rate": 0.0002, "epoch": 0.282513395031661, "step": 290}, {"loss": 1.821, "grad_norm": 0.35176315903663635, "learning_rate": 0.0002, "epoch": 0.2922552362396493, "step": 300}, {"loss": 1.7635, "grad_norm": 0.3581472635269165, "learning_rate": 0.0002, "epoch": 0.3019970774476376, "step": 310}, {"loss": 1.8262, "grad_norm": 0.35943421721458435, "learning_rate": 0.0002, "epoch": 0.3117389186556259, "step": 320}, {"loss": 1.8167, "grad_norm": 0.322051078081131, "learning_rate": 0.0002, "epoch": 0.32148075986361424, "step": 330}, {"loss": 1.8221, "grad_norm": 0.33904823660850525, "learning_rate": 0.0002, "epoch": 0.3312226010716025, "step": 340}, {"loss": 1.9159, "grad_norm": 0.39162731170654297, "learning_rate": 0.0002, "epoch": 0.34096444227959083, "step": 350}, {"loss": 1.7966, "grad_norm": 0.330624520778656, "learning_rate": 0.0002, "epoch": 0.35070628348757915, "step": 360}, {"loss": 1.8646, "grad_norm": 0.3793248236179352, "learning_rate": 0.0002, "epoch": 0.3604481246955675, "step": 370}, {"loss": 1.8038, "grad_norm": 0.3347395658493042, "learning_rate": 0.0002, "epoch": 0.3701899659035558, "step": 380}, {"loss": 1.9244, "grad_norm": 0.30527254939079285, "learning_rate": 0.0002, "epoch": 0.37993180711154406, "step": 390}, {"loss": 1.7982, "grad_norm": 0.3081390857696533, "learning_rate": 0.0002, "epoch": 0.3896736483195324, "step": 400}, {"loss": 1.8968, "grad_norm": 0.3742620050907135, "learning_rate": 0.0002, "epoch": 0.3994154895275207, "step": 410}, {"loss": 1.8095, "grad_norm": 0.4080568253993988, "learning_rate": 0.0002, "epoch": 0.40915733073550903, "step": 420}, {"loss": 1.8555, "grad_norm": 0.38034746050834656, "learning_rate": 0.0002, "epoch": 0.4188991719434973, "step": 430}, {"loss": 1.8494, "grad_norm": 0.34893402457237244, "learning_rate": 0.0002, "epoch": 0.4286410131514856, "step": 440}, {"loss": 1.8481, "grad_norm": 0.33285608887672424, "learning_rate": 0.0002, "epoch": 0.43838285435947394, "step": 450}, {"loss": 1.8466, "grad_norm": 0.4110095798969269, "learning_rate": 0.0002, "epoch": 0.44812469556746226, "step": 460}, {"loss": 1.7906, "grad_norm": 0.3658817410469055, "learning_rate": 0.0002, "epoch": 0.4578665367754506, "step": 470}, {"loss": 1.7589, "grad_norm": 0.31350770592689514, "learning_rate": 0.0002, "epoch": 0.46760837798343885, "step": 480}, {"loss": 1.7839, "grad_norm": 0.38827991485595703, "learning_rate": 0.0002, "epoch": 0.47735021919142717, "step": 490}, {"loss": 1.8224, "grad_norm": 0.3792393207550049, "learning_rate": 0.0002, "epoch": 0.4870920603994155, "step": 500}, {"loss": 1.8028, "grad_norm": 0.3004095256328583, "learning_rate": 0.0002, "epoch": 0.4968339016074038, "step": 510}, {"loss": 1.6899, "grad_norm": 0.3200063407421112, "learning_rate": 0.0002, "epoch": 0.5065757428153921, "step": 520}, {"loss": 1.8205, "grad_norm": 0.3206128478050232, "learning_rate": 0.0002, "epoch": 0.5163175840233805, "step": 530}, {"loss": 1.7725, "grad_norm": 0.30258631706237793, "learning_rate": 0.0002, "epoch": 0.5260594252313687, "step": 540}, {"loss": 1.7791, "grad_norm": 0.28210392594337463, "learning_rate": 0.0002, "epoch": 0.535801266439357, "step": 550}, {"loss": 1.8221, "grad_norm": 0.34854066371917725, "learning_rate": 0.0002, "epoch": 0.5455431076473454, "step": 560}, {"loss": 1.8331, "grad_norm": 0.31689873337745667, "learning_rate": 0.0002, "epoch": 0.5552849488553336, "step": 570}, {"loss": 1.8311, "grad_norm": 0.31253790855407715, "learning_rate": 0.0002, "epoch": 0.565026790063322, "step": 580}, {"loss": 1.7035, "grad_norm": 0.3229721188545227, "learning_rate": 0.0002, "epoch": 0.5747686312713103, "step": 590}, {"loss": 1.766, "grad_norm": 0.3723772466182709, "learning_rate": 0.0002, "epoch": 0.5845104724792985, "step": 600}, {"loss": 1.8357, "grad_norm": 0.345798522233963, "learning_rate": 0.0002, "epoch": 0.5942523136872869, "step": 610}, {"loss": 1.7266, "grad_norm": 0.3440598249435425, "learning_rate": 0.0002, "epoch": 0.6039941548952752, "step": 620}, {"loss": 1.8307, "grad_norm": 0.3406416177749634, "learning_rate": 0.0002, "epoch": 0.6137359961032636, "step": 630}, {"loss": 1.8103, "grad_norm": 0.3218357264995575, "learning_rate": 0.0002, "epoch": 0.6234778373112518, "step": 640}, {"loss": 1.7457, "grad_norm": 0.45319172739982605, "learning_rate": 0.0002, "epoch": 0.6332196785192401, "step": 650}, {"loss": 1.8674, "grad_norm": 0.2787110507488251, "learning_rate": 0.0002, "epoch": 0.6429615197272285, "step": 660}, {"loss": 1.8426, "grad_norm": 0.3064707815647125, "learning_rate": 0.0002, "epoch": 0.6527033609352167, "step": 670}, {"loss": 1.846, "grad_norm": 0.2940629720687866, "learning_rate": 0.0002, "epoch": 0.662445202143205, "step": 680}, {"loss": 1.7865, "grad_norm": 0.31695225834846497, "learning_rate": 0.0002, "epoch": 0.6721870433511934, "step": 690}, {"loss": 1.8449, "grad_norm": 0.29589611291885376, "learning_rate": 0.0002, "epoch": 0.6819288845591817, "step": 700}, {"loss": 1.7828, "grad_norm": 0.3062121570110321, "learning_rate": 0.0002, "epoch": 0.69167072576717, "step": 710}, {"loss": 1.8108, "grad_norm": 0.3315656781196594, "learning_rate": 0.0002, "epoch": 0.7014125669751583, "step": 720}, {"loss": 1.8056, "grad_norm": 0.30353930592536926, "learning_rate": 0.0002, "epoch": 0.7111544081831466, "step": 730}, {"loss": 1.7682, "grad_norm": 0.28360483050346375, "learning_rate": 0.0002, "epoch": 0.720896249391135, "step": 740}, {"loss": 1.7386, "grad_norm": 0.3362562656402588, "learning_rate": 0.0002, "epoch": 0.7306380905991232, "step": 750}, {"loss": 1.7607, "grad_norm": 0.40434667468070984, "learning_rate": 0.0002, "epoch": 0.7403799318071116, "step": 760}, {"loss": 1.8374, "grad_norm": 0.2930425703525543, "learning_rate": 0.0002, "epoch": 0.7501217730150999, "step": 770}, {"loss": 1.8216, "grad_norm": 0.30177003145217896, "learning_rate": 0.0002, "epoch": 0.7598636142230881, "step": 780}, {"loss": 1.8081, "grad_norm": 0.2784474790096283, "learning_rate": 0.0002, "epoch": 0.7696054554310765, "step": 790}, {"loss": 1.7953, "grad_norm": 0.35849854350090027, "learning_rate": 0.0002, "epoch": 0.7793472966390648, "step": 800}, {"loss": 1.7896, "grad_norm": 0.27329114079475403, "learning_rate": 0.0002, "epoch": 0.7890891378470531, "step": 810}, {"loss": 1.7633, "grad_norm": 0.33331671357154846, "learning_rate": 0.0002, "epoch": 0.7988309790550414, "step": 820}, {"loss": 1.7508, "grad_norm": 0.28727295994758606, "learning_rate": 0.0002, "epoch": 0.8085728202630297, "step": 830}, {"loss": 1.7541, "grad_norm": 0.31391268968582153, "learning_rate": 0.0002, "epoch": 0.8183146614710181, "step": 840}, {"loss": 1.793, "grad_norm": 0.3303709030151367, "learning_rate": 0.0002, "epoch": 0.8280565026790063, "step": 850}, {"loss": 1.7479, "grad_norm": 0.33772537112236023, "learning_rate": 0.0002, "epoch": 0.8377983438869946, "step": 860}, {"loss": 1.7138, "grad_norm": 0.32876333594322205, "learning_rate": 0.0002, "epoch": 0.847540185094983, "step": 870}, {"loss": 1.7832, "grad_norm": 0.28444716334342957, "learning_rate": 0.0002, "epoch": 0.8572820263029712, "step": 880}, {"loss": 1.7077, "grad_norm": 0.3070019483566284, "learning_rate": 0.0002, "epoch": 0.8670238675109596, "step": 890}, {"loss": 1.7699, "grad_norm": 0.29484760761260986, "learning_rate": 0.0002, "epoch": 0.8767657087189479, "step": 900}, {"loss": 1.7211, "grad_norm": 0.32373034954071045, "learning_rate": 0.0002, "epoch": 0.8865075499269361, "step": 910}, {"loss": 1.7799, "grad_norm": 0.3229396939277649, "learning_rate": 0.0002, "epoch": 0.8962493911349245, "step": 920}, {"loss": 1.8226, "grad_norm": 0.33151453733444214, "learning_rate": 0.0002, "epoch": 0.9059912323429128, "step": 930}, {"loss": 1.8339, "grad_norm": 0.32037460803985596, "learning_rate": 0.0002, "epoch": 0.9157330735509012, "step": 940}, {"loss": 1.822, "grad_norm": 0.31283533573150635, "learning_rate": 0.0002, "epoch": 0.9254749147588894, "step": 950}, {"loss": 1.8233, "grad_norm": 0.27984118461608887, "learning_rate": 0.0002, "epoch": 0.9352167559668777, "step": 960}, {"loss": 1.7755, "grad_norm": 0.316500186920166, "learning_rate": 0.0002, "epoch": 0.9449585971748661, "step": 970}, {"loss": 1.8032, "grad_norm": 0.33708682656288147, "learning_rate": 0.0002, "epoch": 0.9547004383828543, "step": 980}, {"loss": 1.8863, "grad_norm": 0.31026017665863037, "learning_rate": 0.0002, "epoch": 0.9644422795908427, "step": 990}, {"loss": 1.8458, "grad_norm": 0.30874672532081604, "learning_rate": 0.0002, "epoch": 0.974184120798831, "step": 1000}, {"loss": 1.7975, "grad_norm": 0.3257741630077362, "learning_rate": 0.0002, "epoch": 0.9839259620068193, "step": 1010}, {"loss": 1.7936, "grad_norm": 0.2865653932094574, "learning_rate": 0.0002, "epoch": 0.9936678032148076, "step": 1020}, {"eval_loss": 1.8103164434432983, "eval_runtime": 56.3917, "eval_samples_per_second": 8.991, "eval_steps_per_second": 1.135, "epoch": 0.9995129079396006, "step": 1026}, {"loss": 1.7013, "grad_norm": 0.2860608398914337, "learning_rate": 0.0002, "epoch": 1.003409644422796, "step": 1030}, {"loss": 1.7521, "grad_norm": 0.3156210780143738, "learning_rate": 0.0002, "epoch": 1.0131514856307842, "step": 1040}, {"loss": 1.6182, "grad_norm": 0.26126575469970703, "learning_rate": 0.0002, "epoch": 1.0228933268387725, "step": 1050}, {"loss": 1.7546, "grad_norm": 0.3019633889198303, "learning_rate": 0.0002, "epoch": 1.032635168046761, "step": 1060}, {"loss": 1.7096, "grad_norm": 0.534140944480896, "learning_rate": 0.0002, "epoch": 1.042377009254749, "step": 1070}, {"loss": 1.664, "grad_norm": 0.311872661113739, "learning_rate": 0.0002, "epoch": 1.0521188504627375, "step": 1080}, {"loss": 1.7056, "grad_norm": 0.3276001513004303, "learning_rate": 0.0002, "epoch": 1.0618606916707258, "step": 1090}, {"loss": 1.7943, "grad_norm": 0.35227468609809875, "learning_rate": 0.0002, "epoch": 1.071602532878714, "step": 1100}, {"loss": 1.7079, "grad_norm": 0.3597564995288849, "learning_rate": 0.0002, "epoch": 1.0813443740867024, "step": 1110}, {"loss": 1.7635, "grad_norm": 0.3547225296497345, "learning_rate": 0.0002, "epoch": 1.0910862152946907, "step": 1120}, {"loss": 1.6859, "grad_norm": 0.3399786353111267, "learning_rate": 0.0002, "epoch": 1.100828056502679, "step": 1130}, {"loss": 1.7478, "grad_norm": 0.3309086263179779, "learning_rate": 0.0002, "epoch": 1.1105698977106673, "step": 1140}, {"loss": 1.7372, "grad_norm": 0.39330706000328064, "learning_rate": 0.0002, "epoch": 1.1203117389186557, "step": 1150}, {"loss": 1.6236, "grad_norm": 0.3628021776676178, "learning_rate": 0.0002, "epoch": 1.130053580126644, "step": 1160}, {"loss": 1.8022, "grad_norm": 0.32995012402534485, "learning_rate": 0.0002, "epoch": 1.1397954213346322, "step": 1170}, {"loss": 1.636, "grad_norm": 0.36292821168899536, "learning_rate": 0.0002, "epoch": 1.1495372625426206, "step": 1180}, {"loss": 1.7156, "grad_norm": 0.3470092713832855, "learning_rate": 0.0002, "epoch": 1.159279103750609, "step": 1190}, {"loss": 1.7001, "grad_norm": 0.3496156334877014, "learning_rate": 0.0002, "epoch": 1.169020944958597, "step": 1200}, {"loss": 1.716, "grad_norm": 0.3442084789276123, "learning_rate": 0.0002, "epoch": 1.1787627861665855, "step": 1210}, {"loss": 1.7763, "grad_norm": 0.34983909130096436, "learning_rate": 0.0002, "epoch": 1.1885046273745739, "step": 1220}, {"loss": 1.6964, "grad_norm": 0.36505937576293945, "learning_rate": 0.0002, "epoch": 1.198246468582562, "step": 1230}, {"loss": 1.7382, "grad_norm": 0.31624770164489746, "learning_rate": 0.0002, "epoch": 1.2079883097905504, "step": 1240}, {"loss": 1.7196, "grad_norm": 0.3528020679950714, "learning_rate": 0.0002, "epoch": 1.2177301509985388, "step": 1250}, {"loss": 1.752, "grad_norm": 0.29294025897979736, "learning_rate": 0.0002, "epoch": 1.2274719922065271, "step": 1260}, {"loss": 1.719, "grad_norm": 0.35048434138298035, "learning_rate": 0.0002, "epoch": 1.2372138334145153, "step": 1270}, {"loss": 1.6806, "grad_norm": 0.35224461555480957, "learning_rate": 0.0002, "epoch": 1.2469556746225037, "step": 1280}, {"loss": 1.6836, "grad_norm": 0.4041554629802704, "learning_rate": 0.0002, "epoch": 1.256697515830492, "step": 1290}, {"loss": 1.6999, "grad_norm": 0.3447791039943695, "learning_rate": 0.0002, "epoch": 1.2664393570384802, "step": 1300}, {"loss": 1.7601, "grad_norm": 0.3315333425998688, "learning_rate": 0.0002, "epoch": 1.2761811982464686, "step": 1310}, {"loss": 1.7325, "grad_norm": 0.3587741255760193, "learning_rate": 0.0002, "epoch": 1.285923039454457, "step": 1320}, {"loss": 1.6707, "grad_norm": 0.3704394996166229, "learning_rate": 0.0002, "epoch": 1.2956648806624451, "step": 1330}, {"loss": 1.7374, "grad_norm": 0.38131803274154663, "learning_rate": 0.0002, "epoch": 1.3054067218704335, "step": 1340}, {"loss": 1.5956, "grad_norm": 0.36109617352485657, "learning_rate": 0.0002, "epoch": 1.3151485630784219, "step": 1350}, {"loss": 1.682, "grad_norm": 0.37283554673194885, "learning_rate": 0.0002, "epoch": 1.32489040428641, "step": 1360}, {"loss": 1.7351, "grad_norm": 0.31808891892433167, "learning_rate": 0.0002, "epoch": 1.3346322454943984, "step": 1370}, {"loss": 1.8368, "grad_norm": 0.3370385766029358, "learning_rate": 0.0002, "epoch": 1.3443740867023868, "step": 1380}, {"loss": 1.6922, "grad_norm": 0.3568558394908905, "learning_rate": 0.0002, "epoch": 1.354115927910375, "step": 1390}, {"loss": 1.7301, "grad_norm": 0.3537410497665405, "learning_rate": 0.0002, "epoch": 1.3638577691183633, "step": 1400}, {"loss": 1.6534, "grad_norm": 0.3536544144153595, "learning_rate": 0.0002, "epoch": 1.3735996103263517, "step": 1410}, {"loss": 1.6829, "grad_norm": 0.3772895038127899, "learning_rate": 0.0002, "epoch": 1.38334145153434, "step": 1420}, {"loss": 1.7344, "grad_norm": 0.38079720735549927, "learning_rate": 0.0002, "epoch": 1.3930832927423282, "step": 1430}, {"loss": 1.7121, "grad_norm": 0.3811109662055969, "learning_rate": 0.0002, "epoch": 1.4028251339503166, "step": 1440}, {"loss": 1.6424, "grad_norm": 0.38586318492889404, "learning_rate": 0.0002, "epoch": 1.412566975158305, "step": 1450}, {"loss": 1.7438, "grad_norm": 0.3405744135379791, "learning_rate": 0.0002, "epoch": 1.4223088163662934, "step": 1460}, {"loss": 1.7483, "grad_norm": 0.39527642726898193, "learning_rate": 0.0002, "epoch": 1.4320506575742815, "step": 1470}, {"loss": 1.7927, "grad_norm": 0.4494728744029999, "learning_rate": 0.0002, "epoch": 1.44179249878227, "step": 1480}, {"loss": 1.5993, "grad_norm": 0.34068453311920166, "learning_rate": 0.0002, "epoch": 1.4515343399902583, "step": 1490}, {"loss": 1.73, "grad_norm": 0.36169710755348206, "learning_rate": 0.0002, "epoch": 1.4612761811982464, "step": 1500}, {"loss": 1.7189, "grad_norm": 0.31519418954849243, "learning_rate": 0.0002, "epoch": 1.4710180224062348, "step": 1510}, {"loss": 1.7533, "grad_norm": 0.35117292404174805, "learning_rate": 0.0002, "epoch": 1.4807598636142232, "step": 1520}, {"loss": 1.6662, "grad_norm": 0.40951141715049744, "learning_rate": 0.0002, "epoch": 1.4905017048222113, "step": 1530}, {"loss": 1.764, "grad_norm": 0.37542906403541565, "learning_rate": 0.0002, "epoch": 1.5002435460301997, "step": 1540}, {"loss": 1.6563, "grad_norm": 0.35395753383636475, "learning_rate": 0.0002, "epoch": 1.509985387238188, "step": 1550}, {"loss": 1.6517, "grad_norm": 0.35497018694877625, "learning_rate": 0.0002, "epoch": 1.5197272284461762, "step": 1560}, {"loss": 1.697, "grad_norm": 0.3693031072616577, "learning_rate": 0.0002, "epoch": 1.5294690696541646, "step": 1570}, {"loss": 1.7193, "grad_norm": 0.34013301134109497, "learning_rate": 0.0002, "epoch": 1.539210910862153, "step": 1580}, {"loss": 1.7364, "grad_norm": 0.37312784790992737, "learning_rate": 0.0002, "epoch": 1.5489527520701412, "step": 1590}, {"loss": 1.6698, "grad_norm": 0.357496440410614, "learning_rate": 0.0002, "epoch": 1.5586945932781295, "step": 1600}, {"loss": 1.7113, "grad_norm": 0.35192370414733887, "learning_rate": 0.0002, "epoch": 1.568436434486118, "step": 1610}, {"loss": 1.7505, "grad_norm": 0.34144821763038635, "learning_rate": 0.0002, "epoch": 1.578178275694106, "step": 1620}, {"loss": 1.6353, "grad_norm": 0.3320509195327759, "learning_rate": 0.0002, "epoch": 1.5879201169020944, "step": 1630}, {"loss": 1.6781, "grad_norm": 0.34178847074508667, "learning_rate": 0.0002, "epoch": 1.5976619581100828, "step": 1640}, {"loss": 1.7318, "grad_norm": 0.36567580699920654, "learning_rate": 0.0002, "epoch": 1.607403799318071, "step": 1650}, {"loss": 1.6971, "grad_norm": 0.35599812865257263, "learning_rate": 0.0002, "epoch": 1.6171456405260596, "step": 1660}, {"loss": 1.7256, "grad_norm": 0.33765384554862976, "learning_rate": 0.0002, "epoch": 1.6268874817340477, "step": 1670}, {"loss": 1.7396, "grad_norm": 0.33142679929733276, "learning_rate": 0.0002, "epoch": 1.636629322942036, "step": 1680}, {"loss": 1.7143, "grad_norm": 0.6959079504013062, "learning_rate": 0.0002, "epoch": 1.6463711641500245, "step": 1690}, {"loss": 1.7665, "grad_norm": 0.35073819756507874, "learning_rate": 0.0002, "epoch": 1.6561130053580126, "step": 1700}, {"loss": 1.7571, "grad_norm": 0.3461478352546692, "learning_rate": 0.0002, "epoch": 1.665854846566001, "step": 1710}, {"loss": 1.6608, "grad_norm": 0.3697752058506012, "learning_rate": 0.0002, "epoch": 1.6755966877739894, "step": 1720}, {"loss": 1.729, "grad_norm": 0.3755154609680176, "learning_rate": 0.0002, "epoch": 1.6853385289819776, "step": 1730}, {"loss": 1.6618, "grad_norm": 0.33977627754211426, "learning_rate": 0.0002, "epoch": 1.695080370189966, "step": 1740}, {"loss": 1.7207, "grad_norm": 0.4001041650772095, "learning_rate": 0.0002, "epoch": 1.7048222113979543, "step": 1750}, {"loss": 1.6756, "grad_norm": 0.36998286843299866, "learning_rate": 0.0002, "epoch": 1.7145640526059425, "step": 1760}, {"loss": 1.7506, "grad_norm": 0.39944565296173096, "learning_rate": 0.0002, "epoch": 1.7243058938139308, "step": 1770}, {"loss": 1.6725, "grad_norm": 0.4002859890460968, "learning_rate": 0.0002, "epoch": 1.7340477350219192, "step": 1780}, {"loss": 1.768, "grad_norm": 0.33336859941482544, "learning_rate": 0.0002, "epoch": 1.7437895762299074, "step": 1790}, {"loss": 1.7816, "grad_norm": 0.35853952169418335, "learning_rate": 0.0002, "epoch": 1.7535314174378958, "step": 1800}, {"loss": 1.7135, "grad_norm": 0.35876700282096863, "learning_rate": 0.0002, "epoch": 1.7632732586458841, "step": 1810}, {"loss": 1.7327, "grad_norm": 0.3497968912124634, "learning_rate": 0.0002, "epoch": 1.7730150998538723, "step": 1820}, {"loss": 1.7128, "grad_norm": 0.33182016015052795, "learning_rate": 0.0002, "epoch": 1.7827569410618607, "step": 1830}, {"loss": 1.7594, "grad_norm": 0.33359771966934204, "learning_rate": 0.0002, "epoch": 1.792498782269849, "step": 1840}, {"loss": 1.8611, "grad_norm": 0.38070961833000183, "learning_rate": 0.0002, "epoch": 1.8022406234778372, "step": 1850}, {"loss": 1.7576, "grad_norm": 0.34111160039901733, "learning_rate": 0.0002, "epoch": 1.8119824646858256, "step": 1860}, {"loss": 1.7533, "grad_norm": 0.4439302980899811, "learning_rate": 0.0002, "epoch": 1.821724305893814, "step": 1870}, {"loss": 1.7931, "grad_norm": 0.37065210938453674, "learning_rate": 0.0002, "epoch": 1.8314661471018021, "step": 1880}, {"loss": 1.7392, "grad_norm": 0.33630406856536865, "learning_rate": 0.0002, "epoch": 1.8412079883097907, "step": 1890}, {"loss": 1.6369, "grad_norm": 0.334553986787796, "learning_rate": 0.0002, "epoch": 1.8509498295177789, "step": 1900}, {"loss": 1.7271, "grad_norm": 0.3603808879852295, "learning_rate": 0.0002, "epoch": 1.860691670725767, "step": 1910}, {"loss": 1.6777, "grad_norm": 0.4307343363761902, "learning_rate": 0.0002, "epoch": 1.8704335119337556, "step": 1920}, {"loss": 1.75, "grad_norm": 0.455602765083313, "learning_rate": 0.0002, "epoch": 1.8801753531417438, "step": 1930}, {"loss": 1.6898, "grad_norm": 0.35242316126823425, "learning_rate": 0.0002, "epoch": 1.8899171943497322, "step": 1940}, {"loss": 1.7152, "grad_norm": 0.3589116632938385, "learning_rate": 0.0002, "epoch": 1.8996590355577205, "step": 1950}, {"loss": 1.7125, "grad_norm": 0.3540741801261902, "learning_rate": 0.0002, "epoch": 1.9094008767657087, "step": 1960}, {"loss": 1.6873, "grad_norm": 0.3547612428665161, "learning_rate": 0.0002, "epoch": 1.919142717973697, "step": 1970}, {"loss": 1.6995, "grad_norm": 0.3485773503780365, "learning_rate": 0.0002, "epoch": 1.9288845591816854, "step": 1980}, {"loss": 1.7301, "grad_norm": 0.3560304641723633, "learning_rate": 0.0002, "epoch": 1.9386264003896736, "step": 1990}, {"loss": 1.748, "grad_norm": 0.33299335837364197, "learning_rate": 0.0002, "epoch": 1.948368241597662, "step": 2000}, {"loss": 1.7397, "grad_norm": 0.35622233152389526, "learning_rate": 0.0002, "epoch": 1.9581100828056504, "step": 2010}, {"loss": 1.7201, "grad_norm": 0.3681301474571228, "learning_rate": 0.0002, "epoch": 1.9678519240136385, "step": 2020}, {"loss": 1.717, "grad_norm": 0.36158084869384766, "learning_rate": 0.0002, "epoch": 1.9775937652216269, "step": 2030}, {"loss": 1.6332, "grad_norm": 0.32560569047927856, "learning_rate": 0.0002, "epoch": 1.9873356064296153, "step": 2040}, {"loss": 1.6958, "grad_norm": 0.37404149770736694, "learning_rate": 0.0002, "epoch": 1.9970774476376034, "step": 2050}, {"eval_loss": 1.8119343519210815, "eval_runtime": 96.0045, "eval_samples_per_second": 5.281, "eval_steps_per_second": 0.667, "epoch": 2.0, "step": 2053}, {"loss": 1.657, "grad_norm": 0.374188631772995, "learning_rate": 0.0002, "epoch": 2.006819288845592, "step": 2060}, {"loss": 1.5655, "grad_norm": 0.421764075756073, "learning_rate": 0.0002, "epoch": 2.01656113005358, "step": 2070}, {"loss": 1.548, "grad_norm": 0.43841829895973206, "learning_rate": 0.0002, "epoch": 2.0263029712615683, "step": 2080}, {"loss": 1.6326, "grad_norm": 0.42298218607902527, "learning_rate": 0.0002, "epoch": 2.036044812469557, "step": 2090}, {"loss": 1.5883, "grad_norm": 0.43669602274894714, "learning_rate": 0.0002, "epoch": 2.045786653677545, "step": 2100}, {"loss": 1.6143, "grad_norm": 0.4080469012260437, "learning_rate": 0.0002, "epoch": 2.0555284948855332, "step": 2110}, {"loss": 1.5591, "grad_norm": 0.483192503452301, "learning_rate": 0.0002, "epoch": 2.065270336093522, "step": 2120}, {"loss": 1.6492, "grad_norm": 0.44427400827407837, "learning_rate": 0.0002, "epoch": 2.07501217730151, "step": 2130}, {"loss": 1.5845, "grad_norm": 0.48835131525993347, "learning_rate": 0.0002, "epoch": 2.084754018509498, "step": 2140}, {"loss": 1.5617, "grad_norm": 0.42733684182167053, "learning_rate": 0.0002, "epoch": 2.0944958597174868, "step": 2150}, {"loss": 1.5562, "grad_norm": 0.4258694648742676, "learning_rate": 0.0002, "epoch": 2.104237700925475, "step": 2160}, {"loss": 1.517, "grad_norm": 0.5164985656738281, "learning_rate": 0.0002, "epoch": 2.113979542133463, "step": 2170}, {"loss": 1.6393, "grad_norm": 0.4279228150844574, "learning_rate": 0.0002, "epoch": 2.1237213833414517, "step": 2180}, {"loss": 1.5895, "grad_norm": 0.48209506273269653, "learning_rate": 0.0002, "epoch": 2.13346322454944, "step": 2190}, {"loss": 1.6142, "grad_norm": 0.4071785509586334, "learning_rate": 0.0002, "epoch": 2.143205065757428, "step": 2200}, {"loss": 1.5469, "grad_norm": 0.4629398584365845, "learning_rate": 0.0002, "epoch": 2.1529469069654166, "step": 2210}, {"loss": 1.56, "grad_norm": 0.44390997290611267, "learning_rate": 0.0002, "epoch": 2.1626887481734047, "step": 2220}, {"loss": 1.5395, "grad_norm": 0.46886971592903137, "learning_rate": 0.0002, "epoch": 2.172430589381393, "step": 2230}, {"loss": 1.6108, "grad_norm": 0.43745434284210205, "learning_rate": 0.0002, "epoch": 2.1821724305893815, "step": 2240}, {"loss": 1.5416, "grad_norm": 0.42737245559692383, "learning_rate": 0.0002, "epoch": 2.1919142717973696, "step": 2250}, {"loss": 1.627, "grad_norm": 0.5028428435325623, "learning_rate": 0.0002, "epoch": 2.201656113005358, "step": 2260}, {"loss": 1.6148, "grad_norm": 0.48987212777137756, "learning_rate": 0.0002, "epoch": 2.2113979542133464, "step": 2270}, {"loss": 1.6107, "grad_norm": 0.48186370730400085, "learning_rate": 0.0002, "epoch": 2.2211397954213346, "step": 2280}, {"loss": 1.6657, "grad_norm": 0.4417429566383362, "learning_rate": 0.0002, "epoch": 2.2308816366293227, "step": 2290}, {"loss": 1.595, "grad_norm": 0.4757710099220276, "learning_rate": 0.0002, "epoch": 2.2406234778373113, "step": 2300}, {"loss": 1.591, "grad_norm": 0.44449448585510254, "learning_rate": 0.0002, "epoch": 2.2503653190452995, "step": 2310}, {"loss": 1.5742, "grad_norm": 0.5070863962173462, "learning_rate": 0.0002, "epoch": 2.260107160253288, "step": 2320}, {"loss": 1.5831, "grad_norm": 0.4967133700847626, "learning_rate": 0.0002, "epoch": 2.269849001461276, "step": 2330}, {"loss": 1.5857, "grad_norm": 0.5110220909118652, "learning_rate": 0.0002, "epoch": 2.2795908426692644, "step": 2340}, {"loss": 1.6266, "grad_norm": 0.47984135150909424, "learning_rate": 0.0002, "epoch": 2.289332683877253, "step": 2350}, {"loss": 1.5927, "grad_norm": 0.5005794763565063, "learning_rate": 0.0002, "epoch": 2.299074525085241, "step": 2360}, {"loss": 1.6131, "grad_norm": 0.4991425573825836, "learning_rate": 0.0002, "epoch": 2.3088163662932293, "step": 2370}, {"loss": 1.5386, "grad_norm": 0.4948616623878479, "learning_rate": 0.0002, "epoch": 2.318558207501218, "step": 2380}, {"loss": 1.5769, "grad_norm": 0.4533160328865051, "learning_rate": 0.0002, "epoch": 2.328300048709206, "step": 2390}, {"loss": 1.5856, "grad_norm": 0.5871071219444275, "learning_rate": 0.0002, "epoch": 2.338041889917194, "step": 2400}, {"loss": 1.591, "grad_norm": 0.5048075914382935, "learning_rate": 0.0002, "epoch": 2.347783731125183, "step": 2410}, {"loss": 1.6165, "grad_norm": 0.4973750412464142, "learning_rate": 0.0002, "epoch": 2.357525572333171, "step": 2420}, {"loss": 1.589, "grad_norm": 0.48294538259506226, "learning_rate": 0.0002, "epoch": 2.367267413541159, "step": 2430}, {"loss": 1.5782, "grad_norm": 0.7180454134941101, "learning_rate": 0.0002, "epoch": 2.3770092547491477, "step": 2440}, {"loss": 1.56, "grad_norm": 0.4627632796764374, "learning_rate": 0.0002, "epoch": 2.386751095957136, "step": 2450}, {"loss": 1.586, "grad_norm": 0.4834378957748413, "learning_rate": 0.0002, "epoch": 2.396492937165124, "step": 2460}, {"loss": 1.6145, "grad_norm": 0.5173670649528503, "learning_rate": 0.0002, "epoch": 2.4062347783731126, "step": 2470}, {"loss": 1.5464, "grad_norm": 0.49652737379074097, "learning_rate": 0.0002, "epoch": 2.4159766195811008, "step": 2480}, {"loss": 1.6977, "grad_norm": 0.47052669525146484, "learning_rate": 0.0002, "epoch": 2.4257184607890894, "step": 2490}, {"loss": 1.6215, "grad_norm": 0.5188006162643433, "learning_rate": 0.0002, "epoch": 2.4354603019970775, "step": 2500}, {"loss": 1.5512, "grad_norm": 0.5010119676589966, "learning_rate": 0.0002, "epoch": 2.4452021432050657, "step": 2510}, {"loss": 1.6403, "grad_norm": 0.4765235483646393, "learning_rate": 0.0002, "epoch": 2.4549439844130543, "step": 2520}, {"loss": 1.5907, "grad_norm": 0.5292699337005615, "learning_rate": 0.0002, "epoch": 2.4646858256210424, "step": 2530}, {"loss": 1.5866, "grad_norm": 0.48555099964141846, "learning_rate": 0.0002, "epoch": 2.4744276668290306, "step": 2540}, {"loss": 1.5361, "grad_norm": 0.4764043092727661, "learning_rate": 0.0002, "epoch": 2.484169508037019, "step": 2550}, {"loss": 1.6545, "grad_norm": 0.47839659452438354, "learning_rate": 0.0002, "epoch": 2.4939113492450073, "step": 2560}, {"loss": 1.6269, "grad_norm": 0.4514436721801758, "learning_rate": 0.0002, "epoch": 2.5036531904529955, "step": 2570}, {"loss": 1.6389, "grad_norm": 0.5681955218315125, "learning_rate": 0.0002, "epoch": 2.513395031660984, "step": 2580}, {"loss": 1.5976, "grad_norm": 0.49655985832214355, "learning_rate": 0.0002, "epoch": 2.5231368728689723, "step": 2590}, {"loss": 1.6052, "grad_norm": 0.5077657103538513, "learning_rate": 0.0002, "epoch": 2.5328787140769604, "step": 2600}, {"loss": 1.5658, "grad_norm": 0.5643279552459717, "learning_rate": 0.0002, "epoch": 2.542620555284949, "step": 2610}, {"loss": 1.5455, "grad_norm": 0.4715031087398529, "learning_rate": 0.0002, "epoch": 2.552362396492937, "step": 2620}, {"loss": 1.5907, "grad_norm": 0.528400719165802, "learning_rate": 0.0002, "epoch": 2.5621042377009253, "step": 2630}, {"loss": 1.5452, "grad_norm": 0.49469611048698425, "learning_rate": 0.0002, "epoch": 2.571846078908914, "step": 2640}, {"loss": 1.5904, "grad_norm": 0.4567806124687195, "learning_rate": 0.0002, "epoch": 2.581587920116902, "step": 2650}, {"loss": 1.6242, "grad_norm": 0.5357107520103455, "learning_rate": 0.0002, "epoch": 2.5913297613248902, "step": 2660}, {"loss": 1.6078, "grad_norm": 0.46977677941322327, "learning_rate": 0.0002, "epoch": 2.601071602532879, "step": 2670}, {"loss": 1.6994, "grad_norm": 0.6626771092414856, "learning_rate": 0.0002, "epoch": 2.610813443740867, "step": 2680}, {"loss": 1.5888, "grad_norm": 0.4587472081184387, "learning_rate": 0.0002, "epoch": 2.620555284948855, "step": 2690}, {"loss": 1.6002, "grad_norm": 0.4816797077655792, "learning_rate": 0.0002, "epoch": 2.6302971261568437, "step": 2700}, {"loss": 1.5701, "grad_norm": 0.4856809675693512, "learning_rate": 0.0002, "epoch": 2.640038967364832, "step": 2710}, {"loss": 1.563, "grad_norm": 0.46010780334472656, "learning_rate": 0.0002, "epoch": 2.64978080857282, "step": 2720}, {"loss": 1.5374, "grad_norm": 0.4637954533100128, "learning_rate": 0.0002, "epoch": 2.6595226497808087, "step": 2730}, {"loss": 1.6493, "grad_norm": 0.5954997539520264, "learning_rate": 0.0002, "epoch": 2.669264490988797, "step": 2740}, {"loss": 1.5795, "grad_norm": 0.5071861743927002, "learning_rate": 0.0002, "epoch": 2.679006332196785, "step": 2750}, {"loss": 1.573, "grad_norm": 0.5415477156639099, "learning_rate": 0.0002, "epoch": 2.6887481734047736, "step": 2760}, {"loss": 1.5476, "grad_norm": 0.5618549585342407, "learning_rate": 0.0002, "epoch": 2.6984900146127617, "step": 2770}, {"loss": 1.608, "grad_norm": 0.49338817596435547, "learning_rate": 0.0002, "epoch": 2.70823185582075, "step": 2780}, {"loss": 1.6529, "grad_norm": 0.5149586796760559, "learning_rate": 0.0002, "epoch": 2.7179736970287385, "step": 2790}, {"loss": 1.6279, "grad_norm": 0.6247242093086243, "learning_rate": 0.0002, "epoch": 2.7277155382367266, "step": 2800}, {"loss": 1.4655, "grad_norm": 0.4749542474746704, "learning_rate": 0.0002, "epoch": 2.737457379444715, "step": 2810}, {"loss": 1.5984, "grad_norm": 0.4979191720485687, "learning_rate": 0.0002, "epoch": 2.7471992206527034, "step": 2820}, {"loss": 1.6377, "grad_norm": 0.4885074198246002, "learning_rate": 0.0002, "epoch": 2.7569410618606915, "step": 2830}, {"loss": 1.6529, "grad_norm": 0.5047747492790222, "learning_rate": 0.0002, "epoch": 2.76668290306868, "step": 2840}, {"loss": 1.6574, "grad_norm": 0.5280140042304993, "learning_rate": 0.0002, "epoch": 2.7764247442766683, "step": 2850}, {"loss": 1.5639, "grad_norm": 0.477668434381485, "learning_rate": 0.0002, "epoch": 2.7861665854846565, "step": 2860}, {"loss": 1.5923, "grad_norm": 0.4816327393054962, "learning_rate": 0.0002, "epoch": 2.795908426692645, "step": 2870}, {"loss": 1.6377, "grad_norm": 0.523259162902832, "learning_rate": 0.0002, "epoch": 2.805650267900633, "step": 2880}, {"loss": 1.5779, "grad_norm": 0.5045270919799805, "learning_rate": 0.0002, "epoch": 2.8153921091086214, "step": 2890}, {"loss": 1.5915, "grad_norm": 0.47986042499542236, "learning_rate": 0.0002, "epoch": 2.82513395031661, "step": 2900}, {"loss": 1.5997, "grad_norm": 0.4858797490596771, "learning_rate": 0.0002, "epoch": 2.834875791524598, "step": 2910}, {"loss": 1.5664, "grad_norm": 0.5261512398719788, "learning_rate": 0.0002, "epoch": 2.8446176327325867, "step": 2920}, {"loss": 1.5775, "grad_norm": 0.630550742149353, "learning_rate": 0.0002, "epoch": 2.854359473940575, "step": 2930}, {"loss": 1.5889, "grad_norm": 0.49119752645492554, "learning_rate": 0.0002, "epoch": 2.864101315148563, "step": 2940}, {"loss": 1.6033, "grad_norm": 0.4779070317745209, "learning_rate": 0.0002, "epoch": 2.8738431563565516, "step": 2950}, {"loss": 1.6353, "grad_norm": 0.5059782266616821, "learning_rate": 0.0002, "epoch": 2.88358499756454, "step": 2960}, {"loss": 1.6403, "grad_norm": 0.5466655492782593, "learning_rate": 0.0002, "epoch": 2.893326838772528, "step": 2970}, {"loss": 1.619, "grad_norm": 0.4865640103816986, "learning_rate": 0.0002, "epoch": 2.9030686799805165, "step": 2980}, {"loss": 1.5712, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.9128105211885047, "step": 2990}, {"loss": 1.6216, "grad_norm": 0.9112305641174316, "learning_rate": 0.0002, "epoch": 2.922552362396493, "step": 3000}, {"loss": 1.6493, "grad_norm": 0.3938814103603363, "learning_rate": 0.0002, "epoch": 2.9322942036044815, "step": 3010}, {"loss": 1.5446, "grad_norm": 0.5500800609588623, "learning_rate": 0.0002, "epoch": 2.9420360448124696, "step": 3020}, {"loss": 1.5626, "grad_norm": 0.5346390604972839, "learning_rate": 0.0002, "epoch": 2.9517778860204578, "step": 3030}, {"loss": 1.5783, "grad_norm": 0.5245014429092407, "learning_rate": 0.0002, "epoch": 2.9615197272284464, "step": 3040}, {"loss": 1.5549, "grad_norm": 0.4906884431838989, "learning_rate": 0.0002, "epoch": 2.9712615684364345, "step": 3050}, {"loss": 1.5796, "grad_norm": 0.47086769342422485, "learning_rate": 0.0002, "epoch": 2.9810034096444227, "step": 3060}, {"loss": 1.5917, "grad_norm": 0.5290229320526123, "learning_rate": 0.0002, "epoch": 2.9907452508524113, "step": 3070}, {"eval_loss": 1.8463934659957886, "eval_runtime": 56.2401, "eval_samples_per_second": 9.015, "eval_steps_per_second": 1.138, "epoch": 2.9995129079396006, "step": 3079}, {"loss": 1.5744, "grad_norm": 0.49992576241493225, "learning_rate": 0.0002, "epoch": 3.0004870920603994, "step": 3080}, {"loss": 1.4125, "grad_norm": 0.8242783546447754, "learning_rate": 0.0002, "epoch": 3.0102289332683876, "step": 3090}, {"loss": 1.394, "grad_norm": 0.6330569386482239, "learning_rate": 0.0002, "epoch": 3.019970774476376, "step": 3100}, {"loss": 1.4942, "grad_norm": 0.566097617149353, "learning_rate": 0.0002, "epoch": 3.0297126156843643, "step": 3110}, {"loss": 1.4365, "grad_norm": 0.6337586045265198, "learning_rate": 0.0002, "epoch": 3.0394544568923525, "step": 3120}, {"loss": 1.3916, "grad_norm": 0.7339403033256531, "learning_rate": 0.0002, "epoch": 3.049196298100341, "step": 3130}, {"loss": 1.4617, "grad_norm": 0.7187346816062927, "learning_rate": 0.0002, "epoch": 3.0589381393083293, "step": 3140}, {"loss": 1.3453, "grad_norm": 0.7116255760192871, "learning_rate": 0.0002, "epoch": 3.0686799805163174, "step": 3150}, {"loss": 1.4452, "grad_norm": 0.6493807435035706, "learning_rate": 0.0002, "epoch": 3.078421821724306, "step": 3160}, {"loss": 1.351, "grad_norm": 0.6777266263961792, "learning_rate": 0.0002, "epoch": 3.088163662932294, "step": 3170}, {"loss": 1.4362, "grad_norm": 0.6342006325721741, "learning_rate": 0.0002, "epoch": 3.0979055041402823, "step": 3180}, {"loss": 1.4748, "grad_norm": 0.6608964204788208, "learning_rate": 0.0002, "epoch": 3.107647345348271, "step": 3190}, {"loss": 1.375, "grad_norm": 0.7230247259140015, "learning_rate": 0.0002, "epoch": 3.117389186556259, "step": 3200}, {"loss": 1.4049, "grad_norm": 0.650368332862854, "learning_rate": 0.0002, "epoch": 3.1271310277642472, "step": 3210}, {"loss": 1.409, "grad_norm": 0.7319342494010925, "learning_rate": 0.0002, "epoch": 3.136872868972236, "step": 3220}, {"loss": 1.3872, "grad_norm": 0.7159963846206665, "learning_rate": 0.0002, "epoch": 3.146614710180224, "step": 3230}, {"loss": 1.5076, "grad_norm": 0.8905230164527893, "learning_rate": 0.0002, "epoch": 3.156356551388212, "step": 3240}, {"loss": 1.3161, "grad_norm": 0.6920804381370544, "learning_rate": 0.0002, "epoch": 3.1660983925962007, "step": 3250}, {"loss": 1.3786, "grad_norm": 0.6782063841819763, "learning_rate": 0.0002, "epoch": 3.175840233804189, "step": 3260}, {"loss": 1.5153, "grad_norm": 0.735325276851654, "learning_rate": 0.0002, "epoch": 3.1855820750121775, "step": 3270}, {"loss": 1.4027, "grad_norm": 0.6657978296279907, "learning_rate": 0.0002, "epoch": 3.1953239162201656, "step": 3280}, {"loss": 1.3456, "grad_norm": 0.771315336227417, "learning_rate": 0.0002, "epoch": 3.205065757428154, "step": 3290}, {"loss": 1.3236, "grad_norm": 0.6492983102798462, "learning_rate": 0.0002, "epoch": 3.2148075986361424, "step": 3300}, {"loss": 1.4125, "grad_norm": 0.7513770461082458, "learning_rate": 0.0002, "epoch": 3.2245494398441306, "step": 3310}, {"loss": 1.4032, "grad_norm": 0.7091423869132996, "learning_rate": 0.0002, "epoch": 3.2342912810521187, "step": 3320}, {"loss": 1.4585, "grad_norm": 0.6663975119590759, "learning_rate": 0.0002, "epoch": 3.2440331222601073, "step": 3330}, {"loss": 1.3968, "grad_norm": 0.6813122034072876, "learning_rate": 0.0002, "epoch": 3.2537749634680955, "step": 3340}, {"loss": 1.3681, "grad_norm": 0.6602569818496704, "learning_rate": 0.0002, "epoch": 3.2635168046760836, "step": 3350}, {"loss": 1.4533, "grad_norm": 0.718270480632782, "learning_rate": 0.0002, "epoch": 3.2732586458840722, "step": 3360}, {"loss": 1.4076, "grad_norm": 0.6884173154830933, "learning_rate": 0.0002, "epoch": 3.2830004870920604, "step": 3370}, {"loss": 1.4144, "grad_norm": 0.7039775848388672, "learning_rate": 0.0002, "epoch": 3.2927423283000485, "step": 3380}, {"loss": 1.5077, "grad_norm": 0.7444299459457397, "learning_rate": 0.0002, "epoch": 3.302484169508037, "step": 3390}, {"loss": 1.4255, "grad_norm": 0.7187064290046692, "learning_rate": 0.0002, "epoch": 3.3122260107160253, "step": 3400}, {"loss": 1.3684, "grad_norm": 0.599396288394928, "learning_rate": 0.0002, "epoch": 3.3219678519240134, "step": 3410}, {"loss": 1.4819, "grad_norm": 0.7670390009880066, "learning_rate": 0.0002, "epoch": 3.331709693132002, "step": 3420}, {"loss": 1.4411, "grad_norm": 0.6654478311538696, "learning_rate": 0.0002, "epoch": 3.34145153433999, "step": 3430}, {"loss": 1.4257, "grad_norm": 0.6644385457038879, "learning_rate": 0.0002, "epoch": 3.351193375547979, "step": 3440}, {"loss": 1.4508, "grad_norm": 0.6974098086357117, "learning_rate": 0.0002, "epoch": 3.360935216755967, "step": 3450}, {"loss": 1.3807, "grad_norm": 0.7350399494171143, "learning_rate": 0.0002, "epoch": 3.370677057963955, "step": 3460}, {"loss": 1.4176, "grad_norm": 0.714721143245697, "learning_rate": 0.0002, "epoch": 3.3804188991719437, "step": 3470}, {"loss": 1.4325, "grad_norm": 0.7006027698516846, "learning_rate": 0.0002, "epoch": 3.390160740379932, "step": 3480}, {"loss": 1.4888, "grad_norm": 0.6767925024032593, "learning_rate": 0.0002, "epoch": 3.39990258158792, "step": 3490}, {"loss": 1.4116, "grad_norm": 0.6721355319023132, "learning_rate": 0.0002, "epoch": 3.4096444227959086, "step": 3500}, {"loss": 1.443, "grad_norm": 0.6845725178718567, "learning_rate": 0.0002, "epoch": 3.419386264003897, "step": 3510}, {"loss": 1.4832, "grad_norm": 0.6882196664810181, "learning_rate": 0.0002, "epoch": 3.429128105211885, "step": 3520}, {"loss": 1.4962, "grad_norm": 0.7663240432739258, "learning_rate": 0.0002, "epoch": 3.4388699464198735, "step": 3530}, {"loss": 1.4644, "grad_norm": 0.6304219365119934, "learning_rate": 0.0002, "epoch": 3.4486117876278617, "step": 3540}, {"loss": 1.4918, "grad_norm": 0.668678879737854, "learning_rate": 0.0002, "epoch": 3.45835362883585, "step": 3550}, {"loss": 1.4874, "grad_norm": 0.7526912093162537, "learning_rate": 0.0002, "epoch": 3.4680954700438384, "step": 3560}, {"loss": 1.4249, "grad_norm": 1.089495301246643, "learning_rate": 0.0002, "epoch": 3.4778373112518266, "step": 3570}, {"loss": 1.3871, "grad_norm": 0.7282902002334595, "learning_rate": 0.0002, "epoch": 3.4875791524598148, "step": 3580}, {"loss": 1.5077, "grad_norm": 0.6540156602859497, "learning_rate": 0.0002, "epoch": 3.4973209936678034, "step": 3590}, {"loss": 1.4367, "grad_norm": 0.6449568867683411, "learning_rate": 0.0002, "epoch": 3.5070628348757915, "step": 3600}, {"loss": 1.4532, "grad_norm": 0.7262216210365295, "learning_rate": 0.0002, "epoch": 3.5168046760837797, "step": 3610}, {"loss": 1.4374, "grad_norm": 0.6048615574836731, "learning_rate": 0.0002, "epoch": 3.5265465172917683, "step": 3620}, {"loss": 1.3877, "grad_norm": 0.6780537366867065, "learning_rate": 0.0002, "epoch": 3.5362883584997564, "step": 3630}, {"loss": 1.422, "grad_norm": 0.6851925253868103, "learning_rate": 0.0002, "epoch": 3.5460301997077446, "step": 3640}, {"loss": 1.3425, "grad_norm": 0.6530634164810181, "learning_rate": 0.0002, "epoch": 3.555772040915733, "step": 3650}, {"loss": 1.4879, "grad_norm": 0.7193992733955383, "learning_rate": 0.0002, "epoch": 3.5655138821237213, "step": 3660}, {"loss": 1.4555, "grad_norm": 0.767496645450592, "learning_rate": 0.0002, "epoch": 3.5752557233317095, "step": 3670}, {"loss": 1.4824, "grad_norm": 0.6912919282913208, "learning_rate": 0.0002, "epoch": 3.584997564539698, "step": 3680}, {"loss": 1.4497, "grad_norm": 0.7383436560630798, "learning_rate": 0.0002, "epoch": 3.5947394057476862, "step": 3690}, {"loss": 1.4822, "grad_norm": 0.6746662855148315, "learning_rate": 0.0002, "epoch": 3.6044812469556744, "step": 3700}, {"loss": 1.4904, "grad_norm": 0.6885138750076294, "learning_rate": 0.0002, "epoch": 3.614223088163663, "step": 3710}, {"loss": 1.4044, "grad_norm": 0.6694392561912537, "learning_rate": 0.0002, "epoch": 3.623964929371651, "step": 3720}, {"loss": 1.3719, "grad_norm": 0.812358021736145, "learning_rate": 0.0002, "epoch": 3.6337067705796393, "step": 3730}, {"loss": 1.4603, "grad_norm": 0.7267130017280579, "learning_rate": 0.0002, "epoch": 3.643448611787628, "step": 3740}, {"loss": 1.4574, "grad_norm": 0.6958749294281006, "learning_rate": 0.0002, "epoch": 3.653190452995616, "step": 3750}, {"loss": 1.4346, "grad_norm": 0.6805673241615295, "learning_rate": 0.0002, "epoch": 3.6629322942036042, "step": 3760}, {"loss": 1.4338, "grad_norm": 0.7184410095214844, "learning_rate": 0.0002, "epoch": 3.672674135411593, "step": 3770}, {"loss": 1.3935, "grad_norm": 0.7716330289840698, "learning_rate": 0.0002, "epoch": 3.682415976619581, "step": 3780}, {"loss": 1.384, "grad_norm": 0.6675831079483032, "learning_rate": 0.0002, "epoch": 3.6921578178275696, "step": 3790}, {"loss": 1.401, "grad_norm": 0.6480095386505127, "learning_rate": 0.0002, "epoch": 3.7018996590355577, "step": 3800}, {"loss": 1.5303, "grad_norm": 0.6559418439865112, "learning_rate": 0.0002, "epoch": 3.711641500243546, "step": 3810}, {"loss": 1.4341, "grad_norm": 0.6596545577049255, "learning_rate": 0.0002, "epoch": 3.7213833414515345, "step": 3820}, {"loss": 1.4508, "grad_norm": 0.7172950506210327, "learning_rate": 0.0002, "epoch": 3.7311251826595226, "step": 3830}, {"loss": 1.446, "grad_norm": 0.796148419380188, "learning_rate": 0.0002, "epoch": 3.740867023867511, "step": 3840}, {"loss": 1.4992, "grad_norm": 0.6600322723388672, "learning_rate": 0.0002, "epoch": 3.7506088650754994, "step": 3850}, {"loss": 1.4201, "grad_norm": 0.6776387691497803, "learning_rate": 0.0002, "epoch": 3.7603507062834876, "step": 3860}, {"loss": 1.3893, "grad_norm": 0.7768304347991943, "learning_rate": 0.0002, "epoch": 3.770092547491476, "step": 3870}, {"loss": 1.4886, "grad_norm": 1.0579794645309448, "learning_rate": 0.0002, "epoch": 3.7798343886994643, "step": 3880}, {"loss": 1.4556, "grad_norm": 0.6757252812385559, "learning_rate": 0.0002, "epoch": 3.7895762299074525, "step": 3890}, {"loss": 1.4647, "grad_norm": 0.6706996560096741, "learning_rate": 0.0002, "epoch": 3.799318071115441, "step": 3900}, {"loss": 1.4104, "grad_norm": 0.7026948928833008, "learning_rate": 0.0002, "epoch": 3.809059912323429, "step": 3910}, {"loss": 1.5487, "grad_norm": 0.6437768340110779, "learning_rate": 0.0002, "epoch": 3.8188017535314174, "step": 3920}, {"loss": 1.4678, "grad_norm": 0.7015706300735474, "learning_rate": 0.0002, "epoch": 3.828543594739406, "step": 3930}, {"loss": 1.4891, "grad_norm": 0.7049482464790344, "learning_rate": 0.0002, "epoch": 3.838285435947394, "step": 3940}, {"loss": 1.4208, "grad_norm": 0.6533724665641785, "learning_rate": 0.0002, "epoch": 3.8480272771553823, "step": 3950}, {"loss": 1.4435, "grad_norm": 0.7312499284744263, "learning_rate": 0.0002, "epoch": 3.857769118363371, "step": 3960}, {"loss": 1.3886, "grad_norm": 0.6858801245689392, "learning_rate": 0.0002, "epoch": 3.867510959571359, "step": 3970}, {"loss": 1.4423, "grad_norm": 0.770423173904419, "learning_rate": 0.0002, "epoch": 3.877252800779347, "step": 3980}, {"loss": 1.5029, "grad_norm": 0.6987539529800415, "learning_rate": 0.0002, "epoch": 3.886994641987336, "step": 3990}, {"loss": 1.4791, "grad_norm": 0.7072722315788269, "learning_rate": 0.0002, "epoch": 3.896736483195324, "step": 4000}, {"loss": 1.528, "grad_norm": 0.6492931842803955, "learning_rate": 0.0002, "epoch": 3.906478324403312, "step": 4010}, {"loss": 1.3824, "grad_norm": 0.7716232538223267, "learning_rate": 0.0002, "epoch": 3.9162201656113007, "step": 4020}, {"loss": 1.4758, "grad_norm": 0.722949743270874, "learning_rate": 0.0002, "epoch": 3.925962006819289, "step": 4030}, {"loss": 1.3914, "grad_norm": 0.7434365749359131, "learning_rate": 0.0002, "epoch": 3.935703848027277, "step": 4040}, {"loss": 1.4763, "grad_norm": 0.6691509485244751, "learning_rate": 0.0002, "epoch": 3.9454456892352656, "step": 4050}, {"loss": 1.4555, "grad_norm": 0.6850284337997437, "learning_rate": 0.0002, "epoch": 3.9551875304432538, "step": 4060}, {"loss": 1.5275, "grad_norm": 0.6954452991485596, "learning_rate": 0.0002, "epoch": 3.964929371651242, "step": 4070}, {"loss": 1.417, "grad_norm": 0.9316364526748657, "learning_rate": 0.0002, "epoch": 3.9746712128592305, "step": 4080}, {"loss": 1.4532, "grad_norm": 0.6908289194107056, "learning_rate": 0.0002, "epoch": 3.9844130540672187, "step": 4090}, {"loss": 1.4404, "grad_norm": 0.666782021522522, "learning_rate": 0.0002, "epoch": 3.994154895275207, "step": 4100}, {"eval_loss": 1.9233275651931763, "eval_runtime": 55.9536, "eval_samples_per_second": 9.061, "eval_steps_per_second": 1.144, "epoch": 4.0, "step": 4106}, {"loss": 1.3489, "grad_norm": 0.7726166248321533, "learning_rate": 0.0002, "epoch": 4.003896736483195, "step": 4110}, {"loss": 1.1415, "grad_norm": 1.1338967084884644, "learning_rate": 0.0002, "epoch": 4.013638577691184, "step": 4120}, {"loss": 1.2212, "grad_norm": 0.9530029296875, "learning_rate": 0.0002, "epoch": 4.023380418899172, "step": 4130}, {"loss": 1.2002, "grad_norm": 1.1058554649353027, "learning_rate": 0.0002, "epoch": 4.03312226010716, "step": 4140}, {"loss": 1.2381, "grad_norm": 0.8765049576759338, "learning_rate": 0.0002, "epoch": 4.042864101315149, "step": 4150}, {"loss": 1.2708, "grad_norm": 1.1774667501449585, "learning_rate": 0.0002, "epoch": 4.052605942523137, "step": 4160}, {"loss": 1.2116, "grad_norm": 0.9301433563232422, "learning_rate": 0.0002, "epoch": 4.062347783731125, "step": 4170}, {"loss": 1.1807, "grad_norm": 1.0196778774261475, "learning_rate": 0.0002, "epoch": 4.072089624939114, "step": 4180}, {"loss": 1.2602, "grad_norm": 1.1380577087402344, "learning_rate": 0.0002, "epoch": 4.081831466147102, "step": 4190}, {"loss": 1.2521, "grad_norm": 0.9121319651603699, "learning_rate": 0.0002, "epoch": 4.09157330735509, "step": 4200}, {"loss": 1.1747, "grad_norm": 0.9495378732681274, "learning_rate": 0.0002, "epoch": 4.101315148563079, "step": 4210}, {"loss": 1.1829, "grad_norm": 0.8058680295944214, "learning_rate": 0.0002, "epoch": 4.1110569897710665, "step": 4220}, {"loss": 1.1732, "grad_norm": 1.000887393951416, "learning_rate": 0.0002, "epoch": 4.120798830979055, "step": 4230}, {"loss": 1.1947, "grad_norm": 0.9529102444648743, "learning_rate": 0.0002, "epoch": 4.130540672187044, "step": 4240}, {"loss": 1.2104, "grad_norm": 1.0257115364074707, "learning_rate": 0.0002, "epoch": 4.140282513395031, "step": 4250}, {"loss": 1.2293, "grad_norm": 0.9590303897857666, "learning_rate": 0.0002, "epoch": 4.15002435460302, "step": 4260}, {"loss": 1.1918, "grad_norm": 1.065291166305542, "learning_rate": 0.0002, "epoch": 4.159766195811009, "step": 4270}, {"loss": 1.2323, "grad_norm": 0.8819697499275208, "learning_rate": 0.0002, "epoch": 4.169508037018996, "step": 4280}, {"loss": 1.2167, "grad_norm": 1.0335261821746826, "learning_rate": 0.0002, "epoch": 4.179249878226985, "step": 4290}, {"loss": 1.2131, "grad_norm": 0.8872809410095215, "learning_rate": 0.0002, "epoch": 4.1889917194349735, "step": 4300}, {"loss": 1.2794, "grad_norm": 0.9883159399032593, "learning_rate": 0.0002, "epoch": 4.198733560642961, "step": 4310}, {"loss": 1.2544, "grad_norm": 1.0254192352294922, "learning_rate": 0.0002, "epoch": 4.20847540185095, "step": 4320}, {"loss": 1.2595, "grad_norm": 0.9432600736618042, "learning_rate": 0.0002, "epoch": 4.218217243058938, "step": 4330}, {"loss": 1.2684, "grad_norm": 1.1008676290512085, "learning_rate": 0.0002, "epoch": 4.227959084266926, "step": 4340}, {"loss": 1.2149, "grad_norm": 1.0829699039459229, "learning_rate": 0.0002, "epoch": 4.237700925474915, "step": 4350}, {"loss": 1.2621, "grad_norm": 1.016847848892212, "learning_rate": 0.0002, "epoch": 4.247442766682903, "step": 4360}, {"loss": 1.2375, "grad_norm": 0.8924864530563354, "learning_rate": 0.0002, "epoch": 4.257184607890891, "step": 4370}, {"loss": 1.1987, "grad_norm": 0.9300530552864075, "learning_rate": 0.0002, "epoch": 4.26692644909888, "step": 4380}, {"loss": 1.1696, "grad_norm": 0.9684814810752869, "learning_rate": 0.0002, "epoch": 4.276668290306868, "step": 4390}, {"loss": 1.2006, "grad_norm": 0.9916250705718994, "learning_rate": 0.0002, "epoch": 4.286410131514856, "step": 4400}, {"loss": 1.2402, "grad_norm": 0.903680145740509, "learning_rate": 0.0002, "epoch": 4.2961519727228445, "step": 4410}, {"loss": 1.2022, "grad_norm": 0.8713505268096924, "learning_rate": 0.0002, "epoch": 4.305893813930833, "step": 4420}, {"loss": 1.1957, "grad_norm": 0.9983905553817749, "learning_rate": 0.0002, "epoch": 4.315635655138821, "step": 4430}, {"loss": 1.2676, "grad_norm": 1.1689040660858154, "learning_rate": 0.0002, "epoch": 4.3253774963468095, "step": 4440}, {"loss": 1.2166, "grad_norm": 0.9316853880882263, "learning_rate": 0.0002, "epoch": 4.335119337554798, "step": 4450}, {"loss": 1.222, "grad_norm": 0.9175887107849121, "learning_rate": 0.0002, "epoch": 4.344861178762786, "step": 4460}, {"loss": 1.2571, "grad_norm": 0.9348906874656677, "learning_rate": 0.0002, "epoch": 4.354603019970774, "step": 4470}, {"loss": 1.2764, "grad_norm": 0.9727016687393188, "learning_rate": 0.0002, "epoch": 4.364344861178763, "step": 4480}, {"loss": 1.2616, "grad_norm": 0.9843429923057556, "learning_rate": 0.0002, "epoch": 4.374086702386751, "step": 4490}, {"loss": 1.2488, "grad_norm": 0.9615852236747742, "learning_rate": 0.0002, "epoch": 4.383828543594739, "step": 4500}, {"loss": 1.1718, "grad_norm": 0.9688583612442017, "learning_rate": 0.0002, "epoch": 4.393570384802728, "step": 4510}, {"loss": 1.2546, "grad_norm": 0.9933668375015259, "learning_rate": 0.0002, "epoch": 4.403312226010716, "step": 4520}, {"loss": 1.2355, "grad_norm": 1.0626686811447144, "learning_rate": 0.0002, "epoch": 4.413054067218704, "step": 4530}, {"loss": 1.2425, "grad_norm": 0.9536267518997192, "learning_rate": 0.0002, "epoch": 4.422795908426693, "step": 4540}, {"loss": 1.2562, "grad_norm": 0.9777140021324158, "learning_rate": 0.0002, "epoch": 4.432537749634681, "step": 4550}, {"loss": 1.2878, "grad_norm": 0.980780839920044, "learning_rate": 0.0002, "epoch": 4.442279590842669, "step": 4560}, {"loss": 1.2597, "grad_norm": 1.0147196054458618, "learning_rate": 0.0002, "epoch": 4.452021432050658, "step": 4570}, {"loss": 1.2148, "grad_norm": 0.9763361811637878, "learning_rate": 0.0002, "epoch": 4.461763273258645, "step": 4580}, {"loss": 1.3076, "grad_norm": 1.0300798416137695, "learning_rate": 0.0002, "epoch": 4.471505114466634, "step": 4590}, {"loss": 1.2665, "grad_norm": 0.8833121657371521, "learning_rate": 0.0002, "epoch": 4.481246955674623, "step": 4600}, {"loss": 1.1899, "grad_norm": 1.1214020252227783, "learning_rate": 0.0002, "epoch": 4.490988796882611, "step": 4610}, {"loss": 1.2579, "grad_norm": 0.8843787908554077, "learning_rate": 0.0002, "epoch": 4.500730638090599, "step": 4620}, {"loss": 1.2633, "grad_norm": 0.9942020773887634, "learning_rate": 0.0002, "epoch": 4.5104724792985875, "step": 4630}, {"loss": 1.3172, "grad_norm": 1.0033202171325684, "learning_rate": 0.0002, "epoch": 4.520214320506576, "step": 4640}, {"loss": 1.2024, "grad_norm": 0.8767235279083252, "learning_rate": 0.0002, "epoch": 4.529956161714564, "step": 4650}, {"loss": 1.2714, "grad_norm": 1.0117276906967163, "learning_rate": 0.0002, "epoch": 4.539698002922552, "step": 4660}, {"loss": 1.2911, "grad_norm": 1.2787362337112427, "learning_rate": 0.0002, "epoch": 4.549439844130541, "step": 4670}, {"loss": 1.2603, "grad_norm": 0.8824878931045532, "learning_rate": 0.0002, "epoch": 4.559181685338529, "step": 4680}, {"loss": 1.2905, "grad_norm": 0.9209560751914978, "learning_rate": 0.0002, "epoch": 4.568923526546517, "step": 4690}, {"loss": 1.1916, "grad_norm": 1.1064010858535767, "learning_rate": 0.0002, "epoch": 4.578665367754506, "step": 4700}, {"loss": 1.2217, "grad_norm": 0.8914572596549988, "learning_rate": 0.0002, "epoch": 4.588407208962494, "step": 4710}, {"loss": 1.2861, "grad_norm": 1.0412265062332153, "learning_rate": 0.0002, "epoch": 4.598149050170482, "step": 4720}, {"loss": 1.262, "grad_norm": 1.1950221061706543, "learning_rate": 0.0002, "epoch": 4.607890891378471, "step": 4730}, {"loss": 1.2659, "grad_norm": 0.8938062787055969, "learning_rate": 0.0002, "epoch": 4.617632732586459, "step": 4740}, {"loss": 1.2621, "grad_norm": 0.9849569201469421, "learning_rate": 0.0002, "epoch": 4.627374573794447, "step": 4750}, {"loss": 1.2341, "grad_norm": 1.0081515312194824, "learning_rate": 0.0002, "epoch": 4.637116415002436, "step": 4760}, {"loss": 1.2023, "grad_norm": 0.8566309213638306, "learning_rate": 0.0002, "epoch": 4.6468582562104235, "step": 4770}, {"loss": 1.2723, "grad_norm": 1.1750118732452393, "learning_rate": 0.0002, "epoch": 4.656600097418412, "step": 4780}, {"loss": 1.2537, "grad_norm": 0.925502598285675, "learning_rate": 0.0002, "epoch": 4.666341938626401, "step": 4790}, {"loss": 1.2146, "grad_norm": 1.0402472019195557, "learning_rate": 0.0002, "epoch": 4.676083779834388, "step": 4800}, {"loss": 1.2555, "grad_norm": 0.9772472977638245, "learning_rate": 0.0002, "epoch": 4.685825621042377, "step": 4810}, {"loss": 1.2667, "grad_norm": 0.9082779288291931, "learning_rate": 0.0002, "epoch": 4.695567462250366, "step": 4820}, {"loss": 1.2465, "grad_norm": 0.8026862740516663, "learning_rate": 0.0002, "epoch": 4.705309303458353, "step": 4830}, {"loss": 1.3369, "grad_norm": 1.1631089448928833, "learning_rate": 0.0002, "epoch": 4.715051144666342, "step": 4840}, {"loss": 1.261, "grad_norm": 0.9384787678718567, "learning_rate": 0.0002, "epoch": 4.7247929858743305, "step": 4850}, {"loss": 1.2588, "grad_norm": 1.2151581048965454, "learning_rate": 0.0002, "epoch": 4.734534827082318, "step": 4860}, {"loss": 1.363, "grad_norm": 0.9679436087608337, "learning_rate": 0.0002, "epoch": 4.744276668290307, "step": 4870}, {"loss": 1.3292, "grad_norm": 0.8352158069610596, "learning_rate": 0.0002, "epoch": 4.754018509498295, "step": 4880}, {"loss": 1.3056, "grad_norm": 1.0205804109573364, "learning_rate": 0.0002, "epoch": 4.763760350706283, "step": 4890}, {"loss": 1.223, "grad_norm": 0.9814772605895996, "learning_rate": 0.0002, "epoch": 4.773502191914272, "step": 4900}, {"loss": 1.3114, "grad_norm": 1.002854347229004, "learning_rate": 0.0002, "epoch": 4.78324403312226, "step": 4910}, {"loss": 1.3143, "grad_norm": 1.1609505414962769, "learning_rate": 0.0002, "epoch": 4.792985874330248, "step": 4920}, {"loss": 1.3166, "grad_norm": 0.9354982376098633, "learning_rate": 0.0002, "epoch": 4.802727715538237, "step": 4930}, {"loss": 1.2978, "grad_norm": 0.9761685729026794, "learning_rate": 0.0002, "epoch": 4.812469556746225, "step": 4940}, {"loss": 1.2709, "grad_norm": 1.0604596138000488, "learning_rate": 0.0002, "epoch": 4.822211397954213, "step": 4950}, {"loss": 1.2765, "grad_norm": 1.0902808904647827, "learning_rate": 0.0002, "epoch": 4.8319532391622015, "step": 4960}, {"loss": 1.3073, "grad_norm": 1.0174955129623413, "learning_rate": 0.0002, "epoch": 4.84169508037019, "step": 4970}, {"loss": 1.3141, "grad_norm": 1.0995253324508667, "learning_rate": 0.0002, "epoch": 4.851436921578179, "step": 4980}, {"loss": 1.3006, "grad_norm": 0.880993127822876, "learning_rate": 0.0002, "epoch": 4.8611787627861665, "step": 4990}, {"loss": 1.2547, "grad_norm": 0.9472237825393677, "learning_rate": 0.0002, "epoch": 4.870920603994155, "step": 5000}, {"loss": 1.4078, "grad_norm": 0.9504236578941345, "learning_rate": 0.0002, "epoch": 4.880662445202143, "step": 5010}, {"loss": 1.2791, "grad_norm": 1.1261742115020752, "learning_rate": 0.0002, "epoch": 4.890404286410131, "step": 5020}, {"loss": 1.3707, "grad_norm": 0.904674768447876, "learning_rate": 0.0002, "epoch": 4.90014612761812, "step": 5030}, {"loss": 1.2762, "grad_norm": 0.8828991055488586, "learning_rate": 0.0002, "epoch": 4.909887968826109, "step": 5040}, {"loss": 1.2905, "grad_norm": 1.0156532526016235, "learning_rate": 0.0002, "epoch": 4.919629810034096, "step": 5050}, {"loss": 1.3079, "grad_norm": 0.8975168466567993, "learning_rate": 0.0002, "epoch": 4.929371651242085, "step": 5060}, {"loss": 1.3322, "grad_norm": 0.9787213802337646, "learning_rate": 0.0002, "epoch": 4.939113492450073, "step": 5070}, {"loss": 1.2533, "grad_norm": 1.0801568031311035, "learning_rate": 0.0002, "epoch": 4.948855333658061, "step": 5080}, {"loss": 1.238, "grad_norm": 1.0655089616775513, "learning_rate": 0.0002, "epoch": 4.95859717486605, "step": 5090}, {"loss": 1.2449, "grad_norm": 0.8941320180892944, "learning_rate": 0.0002, "epoch": 4.968339016074038, "step": 5100}, {"loss": 1.2846, "grad_norm": 1.050621747970581, "learning_rate": 0.0002, "epoch": 4.978080857282026, "step": 5110}, {"loss": 1.3791, "grad_norm": 0.9724781513214111, "learning_rate": 0.0002, "epoch": 4.987822698490015, "step": 5120}, {"loss": 1.292, "grad_norm": 0.9850538969039917, "learning_rate": 0.0002, "epoch": 4.997564539698003, "step": 5130}, {"eval_loss": 2.0824170112609863, "eval_runtime": 55.592, "eval_samples_per_second": 9.12, "eval_steps_per_second": 1.151, "epoch": 4.9995129079396, "step": 5132}, {"loss": 1.037, "grad_norm": 1.0096189975738525, "learning_rate": 0.0002, "epoch": 5.007306380905991, "step": 5140}, {"loss": 1.0003, "grad_norm": 1.2403408288955688, "learning_rate": 0.0002, "epoch": 5.01704822211398, "step": 5150}, {"loss": 1.0129, "grad_norm": 1.1243221759796143, "learning_rate": 0.0002, "epoch": 5.026790063321968, "step": 5160}, {"loss": 0.9815, "grad_norm": 1.4745502471923828, "learning_rate": 0.0002, "epoch": 5.036531904529956, "step": 5170}, {"loss": 0.9715, "grad_norm": 1.1913198232650757, "learning_rate": 0.0002, "epoch": 5.0462737457379445, "step": 5180}, {"loss": 0.9282, "grad_norm": 1.2732855081558228, "learning_rate": 0.0002, "epoch": 5.056015586945933, "step": 5190}, {"loss": 0.9857, "grad_norm": 1.1737396717071533, "learning_rate": 0.0002, "epoch": 5.065757428153921, "step": 5200}, {"loss": 0.9754, "grad_norm": 1.4162768125534058, "learning_rate": 0.0002, "epoch": 5.075499269361909, "step": 5210}, {"loss": 1.0333, "grad_norm": 1.528274655342102, "learning_rate": 0.0002, "epoch": 5.085241110569898, "step": 5220}, {"loss": 1.0227, "grad_norm": 1.3966618776321411, "learning_rate": 0.0002, "epoch": 5.094982951777886, "step": 5230}, {"loss": 0.987, "grad_norm": 1.3427953720092773, "learning_rate": 0.0002, "epoch": 5.104724792985874, "step": 5240}, {"loss": 1.0353, "grad_norm": 1.6533905267715454, "learning_rate": 0.0002, "epoch": 5.114466634193863, "step": 5250}, {"loss": 1.0452, "grad_norm": 1.4114865064620972, "learning_rate": 0.0002, "epoch": 5.124208475401851, "step": 5260}, {"loss": 1.067, "grad_norm": 1.5460708141326904, "learning_rate": 0.0002, "epoch": 5.133950316609839, "step": 5270}, {"loss": 1.0667, "grad_norm": 1.3491919040679932, "learning_rate": 0.0002, "epoch": 5.143692157817828, "step": 5280}, {"loss": 0.9957, "grad_norm": 1.2208969593048096, "learning_rate": 0.0002, "epoch": 5.153433999025816, "step": 5290}, {"loss": 1.0362, "grad_norm": 1.1141403913497925, "learning_rate": 0.0002, "epoch": 5.163175840233804, "step": 5300}, {"loss": 0.9744, "grad_norm": 1.2938064336776733, "learning_rate": 0.0002, "epoch": 5.172917681441793, "step": 5310}, {"loss": 1.0438, "grad_norm": 1.2704918384552002, "learning_rate": 0.0002, "epoch": 5.1826595226497805, "step": 5320}, {"loss": 1.0015, "grad_norm": 1.3928544521331787, "learning_rate": 0.0002, "epoch": 5.192401363857769, "step": 5330}, {"loss": 1.025, "grad_norm": 1.1993824243545532, "learning_rate": 0.0002, "epoch": 5.202143205065758, "step": 5340}, {"loss": 1.0195, "grad_norm": 1.5913670063018799, "learning_rate": 0.0002, "epoch": 5.211885046273745, "step": 5350}, {"loss": 1.0113, "grad_norm": 1.1577855348587036, "learning_rate": 0.0002, "epoch": 5.221626887481734, "step": 5360}, {"loss": 1.0684, "grad_norm": 1.4535993337631226, "learning_rate": 0.0002, "epoch": 5.231368728689723, "step": 5370}, {"loss": 1.0255, "grad_norm": 1.5068976879119873, "learning_rate": 0.0002, "epoch": 5.24111056989771, "step": 5380}, {"loss": 1.0068, "grad_norm": 1.2365459203720093, "learning_rate": 0.0002, "epoch": 5.250852411105699, "step": 5390}, {"loss": 1.0145, "grad_norm": 1.3197922706604004, "learning_rate": 0.0002, "epoch": 5.2605942523136875, "step": 5400}, {"loss": 1.0767, "grad_norm": 1.2395117282867432, "learning_rate": 0.0002, "epoch": 5.270336093521675, "step": 5410}, {"loss": 1.0292, "grad_norm": 1.1841236352920532, "learning_rate": 0.0002, "epoch": 5.280077934729664, "step": 5420}, {"loss": 1.0233, "grad_norm": 1.218003749847412, "learning_rate": 0.0002, "epoch": 5.289819775937652, "step": 5430}, {"loss": 1.0093, "grad_norm": 1.2210947275161743, "learning_rate": 0.0002, "epoch": 5.29956161714564, "step": 5440}, {"loss": 0.9619, "grad_norm": 1.266006588935852, "learning_rate": 0.0002, "epoch": 5.309303458353629, "step": 5450}, {"loss": 1.0352, "grad_norm": 1.2598075866699219, "learning_rate": 0.0002, "epoch": 5.319045299561617, "step": 5460}, {"loss": 1.0929, "grad_norm": 1.2410019636154175, "learning_rate": 0.0002, "epoch": 5.328787140769606, "step": 5470}, {"loss": 1.058, "grad_norm": 1.249698519706726, "learning_rate": 0.0002, "epoch": 5.338528981977594, "step": 5480}, {"loss": 1.0457, "grad_norm": 1.2398173809051514, "learning_rate": 0.0002, "epoch": 5.348270823185582, "step": 5490}, {"loss": 1.0139, "grad_norm": 1.2416654825210571, "learning_rate": 0.0002, "epoch": 5.35801266439357, "step": 5500}, {"loss": 1.0609, "grad_norm": 1.398706316947937, "learning_rate": 0.0002, "epoch": 5.3677545056015585, "step": 5510}, {"loss": 1.0512, "grad_norm": 1.3049418926239014, "learning_rate": 0.0002, "epoch": 5.377496346809547, "step": 5520}, {"loss": 1.0912, "grad_norm": 1.2528893947601318, "learning_rate": 0.0002, "epoch": 5.387238188017536, "step": 5530}, {"loss": 1.0619, "grad_norm": 1.2963255643844604, "learning_rate": 0.0002, "epoch": 5.3969800292255234, "step": 5540}, {"loss": 1.0194, "grad_norm": 1.494231104850769, "learning_rate": 0.0002, "epoch": 5.406721870433512, "step": 5550}, {"loss": 1.0179, "grad_norm": 1.2760992050170898, "learning_rate": 0.0002, "epoch": 5.416463711641501, "step": 5560}, {"loss": 1.1088, "grad_norm": 1.195292592048645, "learning_rate": 0.0002, "epoch": 5.426205552849488, "step": 5570}, {"loss": 1.0859, "grad_norm": 1.6408965587615967, "learning_rate": 0.0002, "epoch": 5.435947394057477, "step": 5580}, {"loss": 1.0868, "grad_norm": 1.3092058897018433, "learning_rate": 0.0002, "epoch": 5.4456892352654656, "step": 5590}, {"loss": 1.006, "grad_norm": 1.2960586547851562, "learning_rate": 0.0002, "epoch": 5.455431076473453, "step": 5600}, {"loss": 1.0257, "grad_norm": 1.3560487031936646, "learning_rate": 0.0002, "epoch": 5.465172917681442, "step": 5610}, {"loss": 1.0314, "grad_norm": 1.1896311044692993, "learning_rate": 0.0002, "epoch": 5.4749147588894305, "step": 5620}, {"loss": 1.0435, "grad_norm": 1.3145595788955688, "learning_rate": 0.0002, "epoch": 5.484656600097418, "step": 5630}, {"loss": 1.0456, "grad_norm": 1.2207404375076294, "learning_rate": 0.0002, "epoch": 5.494398441305407, "step": 5640}, {"loss": 1.0823, "grad_norm": 1.266015887260437, "learning_rate": 0.0002, "epoch": 5.504140282513395, "step": 5650}, {"loss": 1.0696, "grad_norm": 1.2478289604187012, "learning_rate": 0.0002, "epoch": 5.513882123721383, "step": 5660}, {"loss": 1.0695, "grad_norm": 1.4851372241973877, "learning_rate": 0.0002, "epoch": 5.523623964929372, "step": 5670}, {"loss": 1.0736, "grad_norm": 1.4478679895401, "learning_rate": 0.0002, "epoch": 5.53336580613736, "step": 5680}, {"loss": 1.043, "grad_norm": 1.1079537868499756, "learning_rate": 0.0002, "epoch": 5.543107647345348, "step": 5690}, {"loss": 1.1107, "grad_norm": 1.4201879501342773, "learning_rate": 0.0002, "epoch": 5.552849488553337, "step": 5700}, {"loss": 1.0697, "grad_norm": 1.2092000246047974, "learning_rate": 0.0002, "epoch": 5.562591329761325, "step": 5710}, {"loss": 0.9868, "grad_norm": 1.4515851736068726, "learning_rate": 0.0002, "epoch": 5.572333170969313, "step": 5720}, {"loss": 1.1547, "grad_norm": 1.3260412216186523, "learning_rate": 0.0002, "epoch": 5.5820750121773015, "step": 5730}, {"loss": 1.1388, "grad_norm": 1.248191475868225, "learning_rate": 0.0002, "epoch": 5.59181685338529, "step": 5740}, {"loss": 1.0597, "grad_norm": 1.2037307024002075, "learning_rate": 0.0002, "epoch": 5.601558694593278, "step": 5750}, {"loss": 1.1425, "grad_norm": 1.341237187385559, "learning_rate": 0.0002, "epoch": 5.611300535801266, "step": 5760}, {"loss": 1.0942, "grad_norm": 1.130115270614624, "learning_rate": 0.0002, "epoch": 5.621042377009255, "step": 5770}, {"loss": 1.1029, "grad_norm": 1.3834772109985352, "learning_rate": 0.0002, "epoch": 5.630784218217243, "step": 5780}, {"loss": 1.0825, "grad_norm": 1.2586270570755005, "learning_rate": 0.0002, "epoch": 5.640526059425231, "step": 5790}, {"loss": 1.0186, "grad_norm": 1.3233023881912231, "learning_rate": 0.0002, "epoch": 5.65026790063322, "step": 5800}, {"loss": 1.0557, "grad_norm": 1.2711341381072998, "learning_rate": 0.0002, "epoch": 5.660009741841208, "step": 5810}, {"loss": 1.0897, "grad_norm": 1.3867720365524292, "learning_rate": 0.0002, "epoch": 5.669751583049196, "step": 5820}, {"loss": 1.0776, "grad_norm": 1.4783269166946411, "learning_rate": 0.0002, "epoch": 5.679493424257185, "step": 5830}, {"loss": 1.0632, "grad_norm": 1.2744768857955933, "learning_rate": 0.0002, "epoch": 5.6892352654651726, "step": 5840}, {"loss": 1.1484, "grad_norm": 1.3405882120132446, "learning_rate": 0.0002, "epoch": 5.698977106673161, "step": 5850}, {"loss": 1.0975, "grad_norm": 1.204300880432129, "learning_rate": 0.0002, "epoch": 5.70871894788115, "step": 5860}, {"loss": 1.0494, "grad_norm": 1.2954572439193726, "learning_rate": 0.0002, "epoch": 5.7184607890891375, "step": 5870}, {"loss": 1.0643, "grad_norm": 1.5478382110595703, "learning_rate": 0.0002, "epoch": 5.728202630297126, "step": 5880}, {"loss": 1.0582, "grad_norm": 1.2095842361450195, "learning_rate": 0.0002, "epoch": 5.737944471505115, "step": 5890}, {"loss": 1.1, "grad_norm": 1.0691519975662231, "learning_rate": 0.0002, "epoch": 5.747686312713103, "step": 5900}, {"loss": 1.0906, "grad_norm": 1.1920677423477173, "learning_rate": 0.0002, "epoch": 5.757428153921091, "step": 5910}, {"loss": 1.1746, "grad_norm": 1.2051277160644531, "learning_rate": 0.0002, "epoch": 5.76716999512908, "step": 5920}, {"loss": 1.1221, "grad_norm": 1.197490930557251, "learning_rate": 0.0002, "epoch": 5.776911836337067, "step": 5930}, {"loss": 1.07, "grad_norm": 1.2003998756408691, "learning_rate": 0.0002, "epoch": 5.786653677545056, "step": 5940}, {"loss": 1.0938, "grad_norm": 1.2323646545410156, "learning_rate": 0.0002, "epoch": 5.7963955187530445, "step": 5950}, {"loss": 1.1443, "grad_norm": 1.2593932151794434, "learning_rate": 0.0002, "epoch": 5.806137359961033, "step": 5960}, {"loss": 1.0829, "grad_norm": 1.1835976839065552, "learning_rate": 0.0002, "epoch": 5.815879201169021, "step": 5970}, {"loss": 1.1056, "grad_norm": 1.4770104885101318, "learning_rate": 0.0002, "epoch": 5.825621042377009, "step": 5980}, {"loss": 1.1934, "grad_norm": 1.1025809049606323, "learning_rate": 0.0002, "epoch": 5.835362883584997, "step": 5990}, {"loss": 1.1323, "grad_norm": 1.364588975906372, "learning_rate": 0.0002, "epoch": 5.845104724792986, "step": 6000}, {"loss": 1.1234, "grad_norm": 1.2340112924575806, "learning_rate": 0.0002, "epoch": 5.854846566000974, "step": 6010}, {"loss": 1.1123, "grad_norm": 1.4925711154937744, "learning_rate": 0.0002, "epoch": 5.864588407208963, "step": 6020}, {"loss": 1.12, "grad_norm": 1.3516744375228882, "learning_rate": 0.0002, "epoch": 5.874330248416951, "step": 6030}, {"loss": 1.1399, "grad_norm": 1.2058138847351074, "learning_rate": 0.0002, "epoch": 5.884072089624939, "step": 6040}, {"loss": 1.1074, "grad_norm": 1.13870108127594, "learning_rate": 0.0002, "epoch": 5.893813930832927, "step": 6050}, {"loss": 1.088, "grad_norm": 1.1587319374084473, "learning_rate": 0.0002, "epoch": 5.9035557720409155, "step": 6060}, {"loss": 1.1376, "grad_norm": 1.164481520652771, "learning_rate": 0.0002, "epoch": 5.913297613248904, "step": 6070}, {"loss": 1.1262, "grad_norm": 1.2115206718444824, "learning_rate": 0.0002, "epoch": 5.923039454456893, "step": 6080}, {"loss": 1.1345, "grad_norm": 1.3201590776443481, "learning_rate": 0.0002, "epoch": 5.93278129566488, "step": 6090}, {"loss": 1.1288, "grad_norm": 1.287380576133728, "learning_rate": 0.0002, "epoch": 5.942523136872869, "step": 6100}, {"loss": 1.1475, "grad_norm": 1.1820166110992432, "learning_rate": 0.0002, "epoch": 5.952264978080858, "step": 6110}, {"loss": 1.1112, "grad_norm": 1.2550667524337769, "learning_rate": 0.0002, "epoch": 5.962006819288845, "step": 6120}, {"loss": 1.1528, "grad_norm": 1.3547813892364502, "learning_rate": 0.0002, "epoch": 5.971748660496834, "step": 6130}, {"loss": 1.0557, "grad_norm": 1.260842204093933, "learning_rate": 0.0002, "epoch": 5.9814905017048225, "step": 6140}, {"loss": 1.1119, "grad_norm": 1.1643036603927612, "learning_rate": 0.0002, "epoch": 5.99123234291281, "step": 6150}, {"eval_loss": 2.2628161907196045, "eval_runtime": 57.2379, "eval_samples_per_second": 8.858, "eval_steps_per_second": 1.118, "epoch": 6.0, "step": 6159}, {"loss": 1.0837, "grad_norm": 0.9384723901748657, "learning_rate": 0.0002, "epoch": 6.000974184120799, "step": 6160}, {"loss": 0.7335, "grad_norm": 2.1525821685791016, "learning_rate": 0.0002, "epoch": 6.0107160253287875, "step": 6170}, {"loss": 0.8416, "grad_norm": 2.0194077491760254, "learning_rate": 0.0002, "epoch": 6.020457866536775, "step": 6180}, {"loss": 0.8443, "grad_norm": 1.5257816314697266, "learning_rate": 0.0002, "epoch": 6.030199707744764, "step": 6190}, {"loss": 0.7543, "grad_norm": 1.5432662963867188, "learning_rate": 0.0002, "epoch": 6.039941548952752, "step": 6200}, {"loss": 0.8104, "grad_norm": 1.6874405145645142, "learning_rate": 0.0002, "epoch": 6.04968339016074, "step": 6210}, {"loss": 0.8395, "grad_norm": 1.7346407175064087, "learning_rate": 0.0002, "epoch": 6.059425231368729, "step": 6220}, {"loss": 0.8027, "grad_norm": 1.5320781469345093, "learning_rate": 0.0002, "epoch": 6.069167072576717, "step": 6230}, {"loss": 0.7488, "grad_norm": 1.4106669425964355, "learning_rate": 0.0002, "epoch": 6.078908913784705, "step": 6240}, {"loss": 0.812, "grad_norm": 1.5568628311157227, "learning_rate": 0.0002, "epoch": 6.088650754992694, "step": 6250}, {"loss": 0.8055, "grad_norm": 1.6155978441238403, "learning_rate": 0.0002, "epoch": 6.098392596200682, "step": 6260}, {"loss": 0.8225, "grad_norm": 1.4820445775985718, "learning_rate": 0.0002, "epoch": 6.10813443740867, "step": 6270}, {"loss": 0.8599, "grad_norm": 1.6163820028305054, "learning_rate": 0.0002, "epoch": 6.1178762786166585, "step": 6280}, {"loss": 0.853, "grad_norm": 1.8396387100219727, "learning_rate": 0.0002, "epoch": 6.127618119824647, "step": 6290}, {"loss": 0.7768, "grad_norm": 1.7181230783462524, "learning_rate": 0.0002, "epoch": 6.137359961032635, "step": 6300}, {"loss": 0.8116, "grad_norm": 1.6568509340286255, "learning_rate": 0.0002, "epoch": 6.147101802240623, "step": 6310}, {"loss": 0.8525, "grad_norm": 1.3481947183609009, "learning_rate": 0.0002, "epoch": 6.156843643448612, "step": 6320}, {"loss": 0.762, "grad_norm": 1.5788342952728271, "learning_rate": 0.0002, "epoch": 6.1665854846566, "step": 6330}, {"loss": 0.886, "grad_norm": 1.5067620277404785, "learning_rate": 0.0002, "epoch": 6.176327325864588, "step": 6340}, {"loss": 0.8375, "grad_norm": 1.8198208808898926, "learning_rate": 0.0002, "epoch": 6.186069167072577, "step": 6350}, {"loss": 0.7867, "grad_norm": 1.4012749195098877, "learning_rate": 0.0002, "epoch": 6.195811008280565, "step": 6360}, {"loss": 0.8144, "grad_norm": 1.759798288345337, "learning_rate": 0.0002, "epoch": 6.205552849488553, "step": 6370}, {"loss": 0.7811, "grad_norm": 1.468922734260559, "learning_rate": 0.0002, "epoch": 6.215294690696542, "step": 6380}, {"loss": 0.8356, "grad_norm": 1.3706471920013428, "learning_rate": 0.0002, "epoch": 6.2250365319045295, "step": 6390}, {"loss": 0.8096, "grad_norm": 1.6397383213043213, "learning_rate": 0.0002, "epoch": 6.234778373112518, "step": 6400}, {"loss": 0.8834, "grad_norm": 1.5614187717437744, "learning_rate": 0.0002, "epoch": 6.244520214320507, "step": 6410}, {"loss": 0.8533, "grad_norm": 1.7118678092956543, "learning_rate": 0.0002, "epoch": 6.2542620555284945, "step": 6420}, {"loss": 0.8653, "grad_norm": 1.4041547775268555, "learning_rate": 0.0002, "epoch": 6.264003896736483, "step": 6430}, {"loss": 0.879, "grad_norm": 1.7653605937957764, "learning_rate": 0.0002, "epoch": 6.273745737944472, "step": 6440}, {"loss": 0.8786, "grad_norm": 2.6219191551208496, "learning_rate": 0.0002, "epoch": 6.28348757915246, "step": 6450}, {"loss": 0.8896, "grad_norm": 1.4757837057113647, "learning_rate": 0.0002, "epoch": 6.293229420360448, "step": 6460}, {"loss": 0.9079, "grad_norm": 1.715598225593567, "learning_rate": 0.0002, "epoch": 6.302971261568437, "step": 6470}, {"loss": 0.8526, "grad_norm": 1.376216173171997, "learning_rate": 0.0002, "epoch": 6.312713102776424, "step": 6480}, {"loss": 0.8742, "grad_norm": 1.7119828462600708, "learning_rate": 0.0002, "epoch": 6.322454943984413, "step": 6490}, {"loss": 0.7988, "grad_norm": 1.4304355382919312, "learning_rate": 0.0002, "epoch": 6.3321967851924015, "step": 6500}, {"loss": 0.8539, "grad_norm": 1.4889872074127197, "learning_rate": 0.0002, "epoch": 6.34193862640039, "step": 6510}, {"loss": 0.9328, "grad_norm": 1.370373010635376, "learning_rate": 0.0002, "epoch": 6.351680467608378, "step": 6520}, {"loss": 0.8997, "grad_norm": 1.7697709798812866, "learning_rate": 0.0002, "epoch": 6.361422308816366, "step": 6530}, {"loss": 0.9421, "grad_norm": 1.495297908782959, "learning_rate": 0.0002, "epoch": 6.371164150024355, "step": 6540}, {"loss": 0.8796, "grad_norm": 1.7251347303390503, "learning_rate": 0.0002, "epoch": 6.380905991232343, "step": 6550}, {"loss": 0.9327, "grad_norm": 1.6909505128860474, "learning_rate": 0.0002, "epoch": 6.390647832440331, "step": 6560}, {"loss": 0.837, "grad_norm": 1.4369314908981323, "learning_rate": 0.0002, "epoch": 6.40038967364832, "step": 6570}, {"loss": 0.8572, "grad_norm": 1.7803739309310913, "learning_rate": 0.0002, "epoch": 6.410131514856308, "step": 6580}, {"loss": 0.9024, "grad_norm": 1.6107097864151, "learning_rate": 0.0002, "epoch": 6.419873356064296, "step": 6590}, {"loss": 0.8469, "grad_norm": 1.6151643991470337, "learning_rate": 0.0002, "epoch": 6.429615197272285, "step": 6600}, {"loss": 0.8791, "grad_norm": 1.7159833908081055, "learning_rate": 0.0002, "epoch": 6.4393570384802725, "step": 6610}, {"loss": 0.9249, "grad_norm": 1.4366064071655273, "learning_rate": 0.0002, "epoch": 6.449098879688261, "step": 6620}, {"loss": 0.8417, "grad_norm": 1.6050453186035156, "learning_rate": 0.0002, "epoch": 6.45884072089625, "step": 6630}, {"loss": 0.8943, "grad_norm": 1.6296740770339966, "learning_rate": 0.0002, "epoch": 6.468582562104237, "step": 6640}, {"loss": 0.9228, "grad_norm": 1.6181174516677856, "learning_rate": 0.0002, "epoch": 6.478324403312226, "step": 6650}, {"loss": 0.9139, "grad_norm": 1.5452176332473755, "learning_rate": 0.0002, "epoch": 6.488066244520215, "step": 6660}, {"loss": 0.9022, "grad_norm": 1.3919731378555298, "learning_rate": 0.0002, "epoch": 6.497808085728202, "step": 6670}, {"loss": 0.9046, "grad_norm": 1.6456257104873657, "learning_rate": 0.0002, "epoch": 6.507549926936191, "step": 6680}, {"loss": 0.9041, "grad_norm": 1.4147369861602783, "learning_rate": 0.0002, "epoch": 6.5172917681441795, "step": 6690}, {"loss": 0.8361, "grad_norm": 1.7005025148391724, "learning_rate": 0.0002, "epoch": 6.527033609352167, "step": 6700}, {"loss": 0.8738, "grad_norm": 1.6032357215881348, "learning_rate": 0.0002, "epoch": 6.536775450560156, "step": 6710}, {"loss": 0.9796, "grad_norm": 1.3454229831695557, "learning_rate": 0.0002, "epoch": 6.5465172917681445, "step": 6720}, {"loss": 0.8573, "grad_norm": 1.6961418390274048, "learning_rate": 0.0002, "epoch": 6.556259132976132, "step": 6730}, {"loss": 0.9241, "grad_norm": 1.78407883644104, "learning_rate": 0.0002, "epoch": 6.566000974184121, "step": 6740}, {"loss": 0.8941, "grad_norm": 1.6817889213562012, "learning_rate": 0.0002, "epoch": 6.575742815392109, "step": 6750}, {"loss": 0.8765, "grad_norm": 1.7894943952560425, "learning_rate": 0.0002, "epoch": 6.585484656600097, "step": 6760}, {"loss": 0.8607, "grad_norm": 1.6404837369918823, "learning_rate": 0.0002, "epoch": 6.595226497808086, "step": 6770}, {"loss": 0.8573, "grad_norm": 1.5849255323410034, "learning_rate": 0.0002, "epoch": 6.604968339016074, "step": 6780}, {"loss": 0.9575, "grad_norm": 1.5993813276290894, "learning_rate": 0.0002, "epoch": 6.614710180224062, "step": 6790}, {"loss": 0.8922, "grad_norm": 1.2834863662719727, "learning_rate": 0.0002, "epoch": 6.624452021432051, "step": 6800}, {"loss": 0.9007, "grad_norm": 1.7215641736984253, "learning_rate": 0.0002, "epoch": 6.634193862640039, "step": 6810}, {"loss": 0.9292, "grad_norm": 1.7588146924972534, "learning_rate": 0.0002, "epoch": 6.643935703848027, "step": 6820}, {"loss": 0.8634, "grad_norm": 1.7956023216247559, "learning_rate": 0.0002, "epoch": 6.6536775450560155, "step": 6830}, {"loss": 0.8108, "grad_norm": 1.5115351676940918, "learning_rate": 0.0002, "epoch": 6.663419386264004, "step": 6840}, {"loss": 0.9329, "grad_norm": 1.5660319328308105, "learning_rate": 0.0002, "epoch": 6.673161227471992, "step": 6850}, {"loss": 0.9877, "grad_norm": 1.4323679208755493, "learning_rate": 0.0002, "epoch": 6.68290306867998, "step": 6860}, {"loss": 0.8732, "grad_norm": 1.662089467048645, "learning_rate": 0.0002, "epoch": 6.692644909887969, "step": 6870}, {"loss": 0.87, "grad_norm": 1.7854869365692139, "learning_rate": 0.0002, "epoch": 6.702386751095958, "step": 6880}, {"loss": 0.9105, "grad_norm": 1.5491222143173218, "learning_rate": 0.0002, "epoch": 6.712128592303945, "step": 6890}, {"loss": 0.9147, "grad_norm": 1.5946987867355347, "learning_rate": 0.0002, "epoch": 6.721870433511934, "step": 6900}, {"loss": 0.9391, "grad_norm": 1.6195964813232422, "learning_rate": 0.0002, "epoch": 6.731612274719922, "step": 6910}, {"loss": 0.8947, "grad_norm": 1.6366901397705078, "learning_rate": 0.0002, "epoch": 6.74135411592791, "step": 6920}, {"loss": 0.8695, "grad_norm": 1.5080382823944092, "learning_rate": 0.0002, "epoch": 6.751095957135899, "step": 6930}, {"loss": 0.9124, "grad_norm": 1.742353916168213, "learning_rate": 0.0002, "epoch": 6.760837798343887, "step": 6940}, {"loss": 0.9118, "grad_norm": 1.690251111984253, "learning_rate": 0.0002, "epoch": 6.770579639551875, "step": 6950}, {"loss": 0.9039, "grad_norm": 1.7103357315063477, "learning_rate": 0.0002, "epoch": 6.780321480759864, "step": 6960}, {"loss": 0.869, "grad_norm": 1.6630914211273193, "learning_rate": 0.0002, "epoch": 6.7900633219678515, "step": 6970}, {"loss": 0.8944, "grad_norm": 1.423768162727356, "learning_rate": 0.0002, "epoch": 6.79980516317584, "step": 6980}, {"loss": 0.9397, "grad_norm": 1.7844693660736084, "learning_rate": 0.0002, "epoch": 6.809547004383829, "step": 6990}, {"loss": 0.8889, "grad_norm": 1.545282006263733, "learning_rate": 0.0002, "epoch": 6.819288845591817, "step": 7000}, {"loss": 0.9333, "grad_norm": 1.4340319633483887, "learning_rate": 0.0002, "epoch": 6.829030686799805, "step": 7010}, {"loss": 0.9486, "grad_norm": 1.5981626510620117, "learning_rate": 0.0002, "epoch": 6.838772528007794, "step": 7020}, {"loss": 0.9062, "grad_norm": 1.5205026865005493, "learning_rate": 0.0002, "epoch": 6.848514369215782, "step": 7030}, {"loss": 0.9245, "grad_norm": 1.6999989748001099, "learning_rate": 0.0002, "epoch": 6.85825621042377, "step": 7040}, {"loss": 0.9313, "grad_norm": 1.6392347812652588, "learning_rate": 0.0002, "epoch": 6.8679980516317585, "step": 7050}, {"loss": 0.9275, "grad_norm": 1.637308955192566, "learning_rate": 0.0002, "epoch": 6.877739892839747, "step": 7060}, {"loss": 0.9672, "grad_norm": 1.671341896057129, "learning_rate": 0.0002, "epoch": 6.887481734047735, "step": 7070}, {"loss": 0.9726, "grad_norm": 1.4437555074691772, "learning_rate": 0.0002, "epoch": 6.897223575255723, "step": 7080}, {"loss": 0.9454, "grad_norm": 1.4251935482025146, "learning_rate": 0.0002, "epoch": 6.906965416463712, "step": 7090}, {"loss": 0.8858, "grad_norm": 1.5106734037399292, "learning_rate": 0.0002, "epoch": 6.9167072576717, "step": 7100}, {"loss": 0.939, "grad_norm": 1.670742154121399, "learning_rate": 0.0002, "epoch": 6.926449098879688, "step": 7110}, {"loss": 0.8818, "grad_norm": 1.4353723526000977, "learning_rate": 0.0002, "epoch": 6.936190940087677, "step": 7120}, {"loss": 0.9354, "grad_norm": 1.9437772035598755, "learning_rate": 0.0002, "epoch": 6.945932781295665, "step": 7130}, {"loss": 0.9623, "grad_norm": 1.4922038316726685, "learning_rate": 0.0002, "epoch": 6.955674622503653, "step": 7140}, {"loss": 0.9653, "grad_norm": 1.489193081855774, "learning_rate": 0.0002, "epoch": 6.965416463711642, "step": 7150}, {"loss": 1.0024, "grad_norm": 1.529490351676941, "learning_rate": 0.0002, "epoch": 6.9751583049196295, "step": 7160}, {"loss": 0.9715, "grad_norm": 1.7370105981826782, "learning_rate": 0.0002, "epoch": 6.984900146127618, "step": 7170}, {"loss": 0.921, "grad_norm": 1.5639604330062866, "learning_rate": 0.0002, "epoch": 6.994641987335607, "step": 7180}, {"eval_loss": 2.521758794784546, "eval_runtime": 56.1587, "eval_samples_per_second": 9.028, "eval_steps_per_second": 1.14, "epoch": 6.9995129079396, "step": 7185}, {"loss": 0.8682, "grad_norm": 1.391621470451355, "learning_rate": 0.0002, "epoch": 7.004383828543594, "step": 7190}, {"loss": 0.6591, "grad_norm": 2.3696491718292236, "learning_rate": 0.0002, "epoch": 7.014125669751583, "step": 7200}, {"loss": 0.5653, "grad_norm": 1.6873828172683716, "learning_rate": 0.0002, "epoch": 7.023867510959572, "step": 7210}, {"loss": 0.6301, "grad_norm": 1.8893300294876099, "learning_rate": 0.0002, "epoch": 7.033609352167559, "step": 7220}, {"loss": 0.5653, "grad_norm": 1.6323082447052002, "learning_rate": 0.0002, "epoch": 7.043351193375548, "step": 7230}, {"loss": 0.6402, "grad_norm": 1.9979127645492554, "learning_rate": 0.0002, "epoch": 7.0530930345835365, "step": 7240}, {"loss": 0.6314, "grad_norm": 2.0339183807373047, "learning_rate": 0.0002, "epoch": 7.062834875791524, "step": 7250}, {"loss": 0.615, "grad_norm": 1.6820781230926514, "learning_rate": 0.0002, "epoch": 7.072576716999513, "step": 7260}, {"loss": 0.7023, "grad_norm": 2.0400710105895996, "learning_rate": 0.0002, "epoch": 7.0823185582075014, "step": 7270}, {"loss": 0.6003, "grad_norm": 2.13495135307312, "learning_rate": 0.0002, "epoch": 7.092060399415489, "step": 7280}, {"loss": 0.6243, "grad_norm": 1.6993554830551147, "learning_rate": 0.0002, "epoch": 7.101802240623478, "step": 7290}, {"loss": 0.6587, "grad_norm": 1.9262464046478271, "learning_rate": 0.0002, "epoch": 7.111544081831466, "step": 7300}, {"loss": 0.6445, "grad_norm": 1.8407244682312012, "learning_rate": 0.0002, "epoch": 7.121285923039454, "step": 7310}, {"loss": 0.6305, "grad_norm": 1.744294285774231, "learning_rate": 0.0002, "epoch": 7.131027764247443, "step": 7320}, {"loss": 0.6886, "grad_norm": 1.7602320909500122, "learning_rate": 0.0002, "epoch": 7.140769605455431, "step": 7330}, {"loss": 0.6582, "grad_norm": 1.7360851764678955, "learning_rate": 0.0002, "epoch": 7.150511446663419, "step": 7340}, {"loss": 0.6172, "grad_norm": 2.0012850761413574, "learning_rate": 0.0002, "epoch": 7.160253287871408, "step": 7350}, {"loss": 0.6636, "grad_norm": 2.064319372177124, "learning_rate": 0.0002, "epoch": 7.169995129079396, "step": 7360}, {"loss": 0.6273, "grad_norm": 1.4556169509887695, "learning_rate": 0.0002, "epoch": 7.179736970287384, "step": 7370}, {"loss": 0.6944, "grad_norm": 2.365649938583374, "learning_rate": 0.0002, "epoch": 7.1894788114953725, "step": 7380}, {"loss": 0.6572, "grad_norm": 1.8271889686584473, "learning_rate": 0.0002, "epoch": 7.199220652703361, "step": 7390}, {"loss": 0.6955, "grad_norm": 1.9143747091293335, "learning_rate": 0.0002, "epoch": 7.208962493911349, "step": 7400}, {"loss": 0.6811, "grad_norm": 1.5670185089111328, "learning_rate": 0.0002, "epoch": 7.218704335119337, "step": 7410}, {"loss": 0.7035, "grad_norm": 1.7452768087387085, "learning_rate": 0.0002, "epoch": 7.228446176327326, "step": 7420}, {"loss": 0.6848, "grad_norm": 1.7830921411514282, "learning_rate": 0.0002, "epoch": 7.238188017535315, "step": 7430}, {"loss": 0.7018, "grad_norm": 1.9281501770019531, "learning_rate": 0.0002, "epoch": 7.247929858743302, "step": 7440}, {"loss": 0.6802, "grad_norm": 1.889663815498352, "learning_rate": 0.0002, "epoch": 7.257671699951291, "step": 7450}, {"loss": 0.6457, "grad_norm": 1.704999566078186, "learning_rate": 0.0002, "epoch": 7.2674135411592795, "step": 7460}, {"loss": 0.6919, "grad_norm": 1.824109435081482, "learning_rate": 0.0002, "epoch": 7.277155382367267, "step": 7470}, {"loss": 0.6814, "grad_norm": 1.5378915071487427, "learning_rate": 0.0002, "epoch": 7.286897223575256, "step": 7480}, {"loss": 0.7257, "grad_norm": 1.830587387084961, "learning_rate": 0.0002, "epoch": 7.296639064783244, "step": 7490}, {"loss": 0.7036, "grad_norm": 2.0029330253601074, "learning_rate": 0.0002, "epoch": 7.306380905991232, "step": 7500}, {"loss": 0.662, "grad_norm": 2.0871448516845703, "learning_rate": 0.0002, "epoch": 7.316122747199221, "step": 7510}, {"loss": 0.6856, "grad_norm": 1.8416074514389038, "learning_rate": 0.0002, "epoch": 7.325864588407209, "step": 7520}, {"loss": 0.688, "grad_norm": 1.8962771892547607, "learning_rate": 0.0002, "epoch": 7.335606429615197, "step": 7530}, {"loss": 0.6935, "grad_norm": 1.899487018585205, "learning_rate": 0.0002, "epoch": 7.345348270823186, "step": 7540}, {"loss": 0.667, "grad_norm": 1.8300765752792358, "learning_rate": 0.0002, "epoch": 7.355090112031174, "step": 7550}, {"loss": 0.7012, "grad_norm": 2.178112268447876, "learning_rate": 0.0002, "epoch": 7.364831953239162, "step": 7560}, {"loss": 0.6172, "grad_norm": 1.8472180366516113, "learning_rate": 0.0002, "epoch": 7.3745737944471506, "step": 7570}, {"loss": 0.6944, "grad_norm": 1.7787587642669678, "learning_rate": 0.0002, "epoch": 7.384315635655139, "step": 7580}, {"loss": 0.7236, "grad_norm": 1.8309564590454102, "learning_rate": 0.0002, "epoch": 7.394057476863127, "step": 7590}, {"loss": 0.7067, "grad_norm": 2.028923273086548, "learning_rate": 0.0002, "epoch": 7.4037993180711155, "step": 7600}, {"loss": 0.6656, "grad_norm": 1.7393525838851929, "learning_rate": 0.0002, "epoch": 7.413541159279104, "step": 7610}, {"loss": 0.6958, "grad_norm": 1.8816498517990112, "learning_rate": 0.0002, "epoch": 7.423283000487092, "step": 7620}, {"loss": 0.6666, "grad_norm": 2.4553585052490234, "learning_rate": 0.0002, "epoch": 7.43302484169508, "step": 7630}, {"loss": 0.6964, "grad_norm": 1.9045933485031128, "learning_rate": 0.0002, "epoch": 7.442766682903069, "step": 7640}, {"loss": 0.6759, "grad_norm": 1.664156198501587, "learning_rate": 0.0002, "epoch": 7.452508524111057, "step": 7650}, {"loss": 0.7777, "grad_norm": 1.792748212814331, "learning_rate": 0.0002, "epoch": 7.462250365319045, "step": 7660}, {"loss": 0.6785, "grad_norm": 1.8481247425079346, "learning_rate": 0.0002, "epoch": 7.471992206527034, "step": 7670}, {"loss": 0.7156, "grad_norm": 2.0541393756866455, "learning_rate": 0.0002, "epoch": 7.481734047735022, "step": 7680}, {"loss": 0.7126, "grad_norm": 1.594969630241394, "learning_rate": 0.0002, "epoch": 7.49147588894301, "step": 7690}, {"loss": 0.7042, "grad_norm": 2.1409924030303955, "learning_rate": 0.0002, "epoch": 7.501217730150999, "step": 7700}, {"loss": 0.6691, "grad_norm": 1.9743319749832153, "learning_rate": 0.0002, "epoch": 7.5109595713589865, "step": 7710}, {"loss": 0.677, "grad_norm": 1.866410493850708, "learning_rate": 0.0002, "epoch": 7.520701412566975, "step": 7720}, {"loss": 0.7159, "grad_norm": 1.9087774753570557, "learning_rate": 0.0002, "epoch": 7.530443253774964, "step": 7730}, {"loss": 0.7564, "grad_norm": 1.8624005317687988, "learning_rate": 0.0002, "epoch": 7.540185094982951, "step": 7740}, {"loss": 0.7355, "grad_norm": 1.629889726638794, "learning_rate": 0.0002, "epoch": 7.54992693619094, "step": 7750}, {"loss": 0.8001, "grad_norm": 2.1364638805389404, "learning_rate": 0.0002, "epoch": 7.559668777398929, "step": 7760}, {"loss": 0.7878, "grad_norm": 1.591701865196228, "learning_rate": 0.0002, "epoch": 7.569410618606916, "step": 7770}, {"loss": 0.7484, "grad_norm": 2.3200602531433105, "learning_rate": 0.0002, "epoch": 7.579152459814905, "step": 7780}, {"loss": 0.7407, "grad_norm": 1.9998793601989746, "learning_rate": 0.0002, "epoch": 7.5888943010228935, "step": 7790}, {"loss": 0.7436, "grad_norm": 1.8921900987625122, "learning_rate": 0.0002, "epoch": 7.598636142230882, "step": 7800}, {"loss": 0.6898, "grad_norm": 1.8826839923858643, "learning_rate": 0.0002, "epoch": 7.60837798343887, "step": 7810}, {"loss": 0.7376, "grad_norm": 1.8796452283859253, "learning_rate": 0.0002, "epoch": 7.618119824646858, "step": 7820}, {"loss": 0.7304, "grad_norm": 1.6528139114379883, "learning_rate": 0.0002, "epoch": 7.627861665854846, "step": 7830}, {"loss": 0.7205, "grad_norm": 1.9646536111831665, "learning_rate": 0.0002, "epoch": 7.637603507062835, "step": 7840}, {"loss": 0.7276, "grad_norm": 1.6951191425323486, "learning_rate": 0.0002, "epoch": 7.647345348270823, "step": 7850}, {"loss": 0.7476, "grad_norm": 1.8734302520751953, "learning_rate": 0.0002, "epoch": 7.657087189478812, "step": 7860}, {"loss": 0.729, "grad_norm": 2.140984058380127, "learning_rate": 0.0002, "epoch": 7.6668290306868, "step": 7870}, {"loss": 0.7979, "grad_norm": 1.8852670192718506, "learning_rate": 0.0002, "epoch": 7.676570871894788, "step": 7880}, {"loss": 0.745, "grad_norm": 2.1172003746032715, "learning_rate": 0.0002, "epoch": 7.686312713102776, "step": 7890}, {"loss": 0.7796, "grad_norm": 1.8237593173980713, "learning_rate": 0.0002, "epoch": 7.696054554310765, "step": 7900}, {"loss": 0.7203, "grad_norm": 2.1399245262145996, "learning_rate": 0.0002, "epoch": 7.705796395518753, "step": 7910}, {"loss": 0.741, "grad_norm": 1.8119547367095947, "learning_rate": 0.0002, "epoch": 7.715538236726742, "step": 7920}, {"loss": 0.7826, "grad_norm": 1.943442463874817, "learning_rate": 0.0002, "epoch": 7.7252800779347295, "step": 7930}, {"loss": 0.7635, "grad_norm": 1.6926734447479248, "learning_rate": 0.0002, "epoch": 7.735021919142718, "step": 7940}, {"loss": 0.7531, "grad_norm": 1.6824363470077515, "learning_rate": 0.0002, "epoch": 7.744763760350706, "step": 7950}, {"loss": 0.7203, "grad_norm": 1.8615055084228516, "learning_rate": 0.0002, "epoch": 7.754505601558694, "step": 7960}, {"loss": 0.7765, "grad_norm": 1.7171595096588135, "learning_rate": 0.0002, "epoch": 7.764247442766683, "step": 7970}, {"loss": 0.7633, "grad_norm": 1.9871152639389038, "learning_rate": 0.0002, "epoch": 7.773989283974672, "step": 7980}, {"loss": 0.8113, "grad_norm": 1.8975892066955566, "learning_rate": 0.0002, "epoch": 7.783731125182659, "step": 7990}, {"loss": 0.7363, "grad_norm": 1.8259385824203491, "learning_rate": 0.0002, "epoch": 7.793472966390648, "step": 8000}, {"loss": 0.7867, "grad_norm": 2.2361183166503906, "learning_rate": 0.0002, "epoch": 7.8032148075986365, "step": 8010}, {"loss": 0.7976, "grad_norm": 1.64067804813385, "learning_rate": 0.0002, "epoch": 7.812956648806624, "step": 8020}, {"loss": 0.7489, "grad_norm": 2.0037248134613037, "learning_rate": 0.0002, "epoch": 7.822698490014613, "step": 8030}, {"loss": 0.7219, "grad_norm": 1.8022961616516113, "learning_rate": 0.0002, "epoch": 7.832440331222601, "step": 8040}, {"loss": 0.7891, "grad_norm": 1.9980754852294922, "learning_rate": 0.0002, "epoch": 7.842182172430589, "step": 8050}, {"loss": 0.735, "grad_norm": 1.632716178894043, "learning_rate": 0.0002, "epoch": 7.851924013638578, "step": 8060}, {"loss": 0.7897, "grad_norm": 1.6348111629486084, "learning_rate": 0.0002, "epoch": 7.861665854846566, "step": 8070}, {"loss": 0.7585, "grad_norm": 1.968295693397522, "learning_rate": 0.0002, "epoch": 7.871407696054554, "step": 8080}, {"loss": 0.7531, "grad_norm": 1.6947685480117798, "learning_rate": 0.0002, "epoch": 7.881149537262543, "step": 8090}, {"loss": 0.7828, "grad_norm": 6.1600341796875, "learning_rate": 0.0002, "epoch": 7.890891378470531, "step": 8100}, {"loss": 0.7612, "grad_norm": 1.9334033727645874, "learning_rate": 0.0002, "epoch": 7.900633219678519, "step": 8110}, {"loss": 0.7877, "grad_norm": 1.729058027267456, "learning_rate": 0.0002, "epoch": 7.9103750608865075, "step": 8120}, {"loss": 0.788, "grad_norm": 1.8671422004699707, "learning_rate": 0.0002, "epoch": 7.920116902094496, "step": 8130}, {"loss": 0.7798, "grad_norm": 1.9794875383377075, "learning_rate": 0.0002, "epoch": 7.929858743302484, "step": 8140}, {"loss": 0.7705, "grad_norm": 1.812229037284851, "learning_rate": 0.0002, "epoch": 7.9396005845104725, "step": 8150}, {"loss": 0.8335, "grad_norm": 1.7354048490524292, "learning_rate": 0.0002, "epoch": 7.949342425718461, "step": 8160}, {"loss": 0.7849, "grad_norm": 1.7386713027954102, "learning_rate": 0.0002, "epoch": 7.959084266926449, "step": 8170}, {"loss": 0.7829, "grad_norm": 1.917111873626709, "learning_rate": 0.0002, "epoch": 7.968826108134437, "step": 8180}, {"loss": 0.7672, "grad_norm": 1.7007793188095093, "learning_rate": 0.0002, "epoch": 7.978567949342426, "step": 8190}, {"loss": 0.8181, "grad_norm": 1.8241386413574219, "learning_rate": 0.0002, "epoch": 7.988309790550414, "step": 8200}]}